diff --git a/.github/workflows/ci-pfs.yml b/.github/workflows/ci-pfs.yml new file mode 100644 index 0000000..f06c170 --- /dev/null +++ b/.github/workflows/ci-pfs.yml @@ -0,0 +1,106 @@ +name: CI (PFS-MS) + +on: + push: + branches: [master] + pull_request: + branches: [master] + +env: + CARGO_TERM_COLOR: always + RUSTFLAGS: "-D warnings" + +defaults: + run: + working-directory: reference/PFS-MS-v1.0 + +jobs: + fmt: + name: rustfmt + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + - run: cargo fmt --all -- --check + + clippy: + name: clippy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy + - uses: Swatinem/rust-cache@v2 + with: + workspaces: reference/PFS-MS-v1.0 + - run: cargo clippy --all-targets --all-features -- -D warnings + + test: + name: test (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + with: + workspaces: reference/PFS-MS-v1.0 + - run: cargo build --verbose + - run: cargo test --all-targets --verbose + - run: cargo test --doc --verbose + + test-vector: + name: regenerate spec test vector + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + with: + workspaces: reference/PFS-MS-v1.0 + - name: Build and run the test-vector example + run: cargo run --example gen_testvector + - name: Inspect generated test vector + run: | + ls -l pfs_ms_testvector.bin + test "$(wc -c < pfs_ms_testvector.bin)" = "2986" + - uses: actions/upload-artifact@v4 + with: + name: pfs-ms-testvector + path: reference/PFS-MS-v1.0/pfs_ms_testvector.bin + + coverage: + name: code coverage + runs-on: ubuntu-latest + steps: + - uses: 
actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: llvm-tools-preview + - uses: Swatinem/rust-cache@v2 + with: + workspaces: reference/PFS-MS-v1.0 + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + - name: Generate coverage report (lcov) + run: cargo llvm-cov --all-features --lcov --output-path lcov.info + - name: Print coverage summary + run: cargo llvm-cov report + # The `pfs` CLI binary is exercised manually, not by `cargo test`; the + # library floor is enforced over everything except the binary and examples. + - name: Enforce minimum library coverage + run: | + cargo llvm-cov report --summary-only \ + --ignore-filename-regex 'bin/|examples/' \ + --fail-under-lines 90 \ + --fail-under-functions 90 + - uses: actions/upload-artifact@v4 + with: + name: pfs-ms-coverage-lcov + path: reference/PFS-MS-v1.0/lcov.info diff --git a/Cargo.toml b/Cargo.toml index 1e78542..3a80a32 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,3 @@ [workspace] resolver = "2" -members = ["reference/PCF-v1.0", "tools/pcf-debug"] +members = ["reference/PCF-v1.0", "reference/PFS-MS-v1.0", "tools/pcf-debug"] diff --git a/implementations/dotnet/src/Pcf/BlockView.cs b/implementations/dotnet/src/Pcf/BlockView.cs new file mode 100644 index 0000000..aeb03ce --- /dev/null +++ b/implementations/dotnet/src/Pcf/BlockView.cs @@ -0,0 +1,35 @@ +using System.Collections.Generic; + +namespace Pcf; + +/// +/// One table block read from disk: its absolute , its parsed +/// (including table_hash and +/// next_table_offset), and its list. +/// +/// +/// This is a read-only view returned by . It +/// exists so that profiles layered on PCF (which must group blocks, inspect each +/// block's table_hash, and follow non-default next_table_offset +/// chains) can reuse PCF's block parsing rather than re-decoding raw bytes. It +/// plays no part in the writer's in-memory bookkeeping. 
+/// +public sealed class BlockView +{ + /// Absolute file offset of the table block. + public ulong Offset { get; } + + /// Parsed 74-byte block header. + public TableBlockHeader Header { get; } + + /// The block's entries, in stored order. + public IReadOnlyList Entries { get; } + + /// Create a block view. + public BlockView(ulong offset, TableBlockHeader header, IReadOnlyList entries) + { + Offset = offset; + Header = header; + Entries = entries; + } +} diff --git a/implementations/dotnet/src/Pcf/Container.cs b/implementations/dotnet/src/Pcf/Container.cs index 138fe8e..1247487 100644 --- a/implementations/dotnet/src/Pcf/Container.cs +++ b/implementations/dotnet/src/Pcf/Container.cs @@ -227,6 +227,20 @@ public List Entries() return outp; } + /// + /// Read a single table block at an absolute , + /// returning its parsed header (including table_hash) and entries. + /// Unlike , which flattens the whole chain, this exposes + /// one block at a time so a caller can follow an arbitrary + /// next_table_offset chain and inspect each block's table_hash. + /// It is a read-only operation and does not alter the container. + /// + public BlockView ReadBlockAt(ulong offset) + { + (TableBlockHeader h, List entries) = ReadBlock(offset); + return new BlockView(offset, h, entries); + } + /// Read a partition's used data. 
public byte[] ReadPartitionData(PartitionEntry entry) { diff --git a/implementations/dotnet/tests/Pcf.Tests/RoundtripTests.cs b/implementations/dotnet/tests/Pcf.Tests/RoundtripTests.cs index abdd367..f77b3b1 100644 --- a/implementations/dotnet/tests/Pcf.Tests/RoundtripTests.cs +++ b/implementations/dotnet/tests/Pcf.Tests/RoundtripTests.cs @@ -132,4 +132,35 @@ public void Overflow_chain_roundtrips() reopened.Verify(); Assert.Equal(5, reopened.Entries().Count); } + + [Fact] + public void ReadBlockAt_exposes_block_view() + { + // A first-block capacity of 2 forces a second (overflow) block for 3 + // partitions, so we can walk the chain block-by-block via ReadBlockAt. + var c = Container.CreateWith(new MemoryStream(), 2, HashAlgo.Sha256); + for (byte i = 1; i <= 3; i++) + { + c.AddPartition(i, TestSupport.Uid(i), $"p{i}", + new byte[] { i, i, i, i }, 0, HashAlgo.Sha256); + } + + ulong off = c.Header.PartitionTableOffset; + int total = 0, blocks = 0; + while (off != 0) + { + BlockView view = c.ReadBlockAt(off); + Assert.Equal(off, view.Offset); + Assert.Equal((int)view.Header.PartitionCount, view.Entries.Count); + // The exposed table_hash must match a recomputation over the block. + byte[] recomputed = TableBlockHeader.ComputeTableHash( + view.Header.TableHashAlgo, view.Header.NextTableOffset, view.Entries); + Assert.Equal(recomputed, view.Header.TableHash); + total += view.Entries.Count; + blocks++; + off = view.Header.NextTableOffset; + } + Assert.Equal(3, total); + Assert.Equal(2, blocks); + } } diff --git a/implementations/php/src/BlockView.php b/implementations/php/src/BlockView.php new file mode 100644 index 0000000..bf74b52 --- /dev/null +++ b/implementations/php/src/BlockView.php @@ -0,0 +1,28 @@ +readBlock($offset); + + return new BlockView($offset, $h, $entries); + } + /** Read a partition's used data. 
*/ public function readPartitionData(PartitionEntry $entry): string { diff --git a/implementations/php/tests/RoundtripTest.php b/implementations/php/tests/RoundtripTest.php index 5071f63..1c3cf3f 100644 --- a/implementations/php/tests/RoundtripTest.php +++ b/implementations/php/tests/RoundtripTest.php @@ -10,6 +10,7 @@ use Kduma\PCF\HashAlgo; use Kduma\PCF\Storage\MemoryStorage; use Kduma\PCF\Storage\StreamStorage; +use Kduma\PCF\TableBlockHeader; /** * End-to-end container tests, porting `roundtrip.rs` and `coverage.rs`. @@ -265,4 +266,35 @@ public function testCompactIntoWritesImage(): void $c->compactInto($out); self::assertSame($c->compactedImage(), $out->getContents()); } + + public function testReadBlockAtExposesBlockView(): void + { + // First block capacity of 2 forces a second (overflow) block for 3 + // partitions, so we can walk the chain block-by-block via readBlockAt. + $c = Container::createWith(new MemoryStorage(), 2, HashAlgo::Sha256); + for ($i = 1; $i <= 3; ++$i) { + $c->addPartition($i, self::uid($i), "p{$i}", str_repeat(\chr($i), 4), 0, HashAlgo::Sha256); + } + + $off = $c->header()->partitionTableOffset; + $total = 0; + $blocks = 0; + while ($off !== 0) { + $view = $c->readBlockAt($off); + self::assertSame($off, $view->offset); + self::assertCount($view->header->partitionCount, $view->entries); + // The exposed table_hash must match a recomputation over the block. 
+ $recomputed = TableBlockHeader::computeTableHash( + $view->header->tableHashAlgo, + $view->header->nextTableOffset, + $view->entries, + ); + self::assertSame($recomputed, $view->header->tableHash); + $total += \count($view->entries); + ++$blocks; + $off = $view->header->nextTableOffset; + } + self::assertSame(3, $total); + self::assertSame(2, $blocks); + } } diff --git a/implementations/ts/src/container.ts b/implementations/ts/src/container.ts index 226fa20..d7429bc 100644 --- a/implementations/ts/src/container.ts +++ b/implementations/ts/src/container.ts @@ -68,6 +68,25 @@ interface BlockInfo { next: number; } +/** + * One table block read from disk: its absolute `offset`, its parsed + * {@link TableBlockHeader} (including `tableHash` and `nextTableOffset`), and + * its {@link PartitionEntry} list. + * + * Returned by {@link Container.readBlockAt}. It lets code layered on PCF group + * blocks, inspect each block's `tableHash`, and follow non-default + * `nextTableOffset` chains, instead of {@link Container.entries} which flattens + * the whole chain. + */ +export interface BlockView { + /** Absolute file offset of the table block. */ + offset: number; + /** Parsed 74-byte block header. */ + header: TableBlockHeader; + /** The block's entries, in stored order. */ + entries: PartitionEntry[]; +} + function bytesEqual(a: Uint8Array, b: Uint8Array): boolean { if (a.length !== b.length) { return false; @@ -242,6 +261,18 @@ export class Container { return out; } + /** + * Read a single table block at an absolute `offset`, returning its parsed + * header (including `tableHash`) and entries. Unlike {@link entries}, which + * flattens the whole chain, this exposes one block at a time so a caller can + * follow an arbitrary `nextTableOffset` chain and inspect each block's + * `tableHash`. It is a read-only operation and does not alter the container. 
+ */ + readBlockAt(offset: number): BlockView { + const [header, entries] = this.readBlock(offset); + return { offset, header, entries }; + } + /** Read a partition's used data. */ readPartitionData(entry: PartitionEntry): Uint8Array { const used = Number(entry.usedBytes); diff --git a/implementations/ts/src/index.ts b/implementations/ts/src/index.ts index d80532a..fa245ba 100644 --- a/implementations/ts/src/index.ts +++ b/implementations/ts/src/index.ts @@ -65,4 +65,4 @@ export { } from "./table.js"; export { type Storage, MemoryStorage } from "./storage.js"; export { NodeFileStorage } from "./node-storage.js"; -export { Container } from "./container.js"; +export { Container, type BlockView } from "./container.js"; diff --git a/implementations/ts/test/roundtrip.test.ts b/implementations/ts/test/roundtrip.test.ts index 90b50c9..131cdf8 100644 --- a/implementations/ts/test/roundtrip.test.ts +++ b/implementations/ts/test/roundtrip.test.ts @@ -3,6 +3,7 @@ import { describe, expect, it } from "vitest"; import { + computeTableHash, Container, entryLabelString, freeBytes, @@ -150,4 +151,34 @@ describe("roundtrip", () => { c2.verify(); expect(c2.entries().length).toBe(1); }); + + it("readBlockAt exposes a block view", () => { + // First block capacity of 2 forces a second (overflow) block for 3 + // partitions, so we can walk the chain block-by-block via readBlockAt. + const c = Container.createWith(new MemoryStorage(), 2, HashAlgo.Sha256); + for (let i = 1; i <= 3; i++) { + c.addPartition(i, uid(i), `p${i}`, new Uint8Array([i, i, i, i]), 0, HashAlgo.Sha256); + } + + let off = Number(c.header().partitionTableOffset); + let total = 0; + let blocks = 0; + while (off !== 0) { + const view = c.readBlockAt(off); + expect(view.offset).toBe(off); + expect(view.entries.length).toBe(view.header.partitionCount); + // The exposed tableHash must match a recomputation over the block. 
+ const recomputed = computeTableHash( + view.header.tableHashAlgo, + view.header.nextTableOffset, + view.entries, + ); + expect(recomputed).toEqual(view.header.tableHash); + total += view.entries.length; + blocks++; + off = Number(view.header.nextTableOffset); + } + expect(total).toBe(3); + expect(blocks).toBe(2); + }); }); diff --git a/reference/PCF-v1.0/src/container.rs b/reference/PCF-v1.0/src/container.rs index 0429c7e..0b537a6 100644 --- a/reference/PCF-v1.0/src/container.rs +++ b/reference/PCF-v1.0/src/container.rs @@ -46,6 +46,25 @@ struct BlockInfo { next: u64, } +/// One table block as read from disk: its absolute `offset`, its parsed +/// [`TableBlockHeader`] (including `table_hash` and `next_table_offset`), and +/// its [`PartitionEntry`] list. +/// +/// This is a read-only view returned by [`Container::read_block_at`]. It exists +/// so that profiles layered on PCF (which must group blocks, inspect each +/// block's `table_hash`, and follow non-default `next_table_offset` chains) can +/// reuse PCF's block parsing and verification rather than re-decoding raw +/// bytes. It plays no part in the writer's in-memory bookkeeping. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BlockView { + /// Absolute file offset of the table block. + pub offset: u64, + /// Parsed 74-byte block header. + pub header: TableBlockHeader, + /// The block's entries, in stored order. + pub entries: Vec, +} + /// A PCF container backed by `S`. pub struct Container { storage: S, @@ -220,6 +239,22 @@ impl Container { Ok(out) } + /// Read a single table block at an absolute `offset`, returning its parsed + /// header (including `table_hash`) and entries as a [`BlockView`]. + /// + /// Unlike [`Self::entries`], which flattens the whole chain, this exposes + /// one block at a time so a caller can follow an arbitrary + /// `next_table_offset` chain and inspect each block's `table_hash`. It is a + /// read-only operation and does not alter the container. 
+ pub fn read_block_at(&mut self, offset: u64) -> Result { + let (header, entries) = self.read_block(offset)?; + Ok(BlockView { + offset, + header, + entries, + }) + } + /// Read a partition's used data. pub fn read_partition_data(&mut self, entry: &PartitionEntry) -> Result> { let mut buf = vec![0u8; entry.used_bytes as usize]; diff --git a/reference/PCF-v1.0/src/lib.rs b/reference/PCF-v1.0/src/lib.rs index 8155c73..1832365 100644 --- a/reference/PCF-v1.0/src/lib.rs +++ b/reference/PCF-v1.0/src/lib.rs @@ -47,7 +47,7 @@ mod header; mod table; pub use consts::*; -pub use container::Container; +pub use container::{BlockView, Container}; pub use entry::{decode_label, encode_label, PartitionEntry}; pub use error::{Error, Result}; pub use hash::HashAlgo; diff --git a/reference/PCF-v1.0/tests/roundtrip.rs b/reference/PCF-v1.0/tests/roundtrip.rs index 0eef4dd..0a4bdad 100644 --- a/reference/PCF-v1.0/tests/roundtrip.rs +++ b/reference/PCF-v1.0/tests/roundtrip.rs @@ -2,7 +2,7 @@ use std::io::Cursor; -use pcf::{Container, Error, HashAlgo}; +use pcf::{compute_table_hash, Container, Error, HashAlgo}; fn uid(n: u8) -> [u8; 16] { let mut u = [0u8; 16]; @@ -136,6 +136,46 @@ fn overflow_chain() { } } +#[test] +fn read_block_at_exposes_block_view() { + // A first-block capacity of 2 forces a second (overflow) block for 3 + // partitions, so we can walk the chain block-by-block via read_block_at. + let mut c = Container::create_with(Cursor::new(Vec::new()), 2, HashAlgo::Sha256).unwrap(); + for i in 1..=3u8 { + c.add_partition( + i as u32, + uid(i), + &format!("p{i}"), + &[i; 4], + 0, + HashAlgo::Sha256, + ) + .unwrap(); + } + + // Walk the chain using only the public block-level API. 
+ let mut off = c.header().partition_table_offset; + let mut total = 0usize; + let mut blocks = 0usize; + while off != 0 { + let view = c.read_block_at(off).unwrap(); + assert_eq!(view.offset, off); + assert_eq!(view.header.partition_count as usize, view.entries.len()); + // The exposed table_hash must match a recomputation over the block. + let recomputed = compute_table_hash( + view.header.table_hash_algo, + view.header.next_table_offset, + &view.entries, + ); + assert_eq!(view.header.table_hash, recomputed); + total += view.entries.len(); + blocks += 1; + off = view.header.next_table_offset; + } + assert_eq!(total, 3); + assert_eq!(blocks, 2); +} + #[test] fn duplicate_uid_rejected() { let mut c = Container::create(Cursor::new(Vec::new())).unwrap(); diff --git a/reference/PFS-MS-v1.0/.gitignore b/reference/PFS-MS-v1.0/.gitignore new file mode 100644 index 0000000..abec85a --- /dev/null +++ b/reference/PFS-MS-v1.0/.gitignore @@ -0,0 +1,26 @@ +# --- Rust / Cargo --- +/target +Cargo.lock +**/*.rs.bk + +# --- Generated test vector --- +pfs_ms_testvector.bin + +# --- JetBrains (RustRover / IntelliJ) --- +.idea/ +*.iml +*.iws +*.ipr +out/ + +# --- macOS --- +.DS_Store +.AppleDouble +.LSOverride +._* + +# --- Editors --- +*.swp +*.swo +*~ +.vscode/ diff --git a/reference/PFS-MS-v1.0/Cargo.toml b/reference/PFS-MS-v1.0/Cargo.toml new file mode 100644 index 0000000..3752e9d --- /dev/null +++ b/reference/PFS-MS-v1.0/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "pfs-ms" +version = "1.0.0" +edition = "2021" +description = "Reference implementation of PFS-MS v1.0, the PCF File System Multi-Session profile" +license = "MIT OR Apache-2.0" +repository = "https://example.invalid/pfs-ms" +readme = "README.md" + +# This crate is a *reference* implementation of the PFS-MS profile. Like the +# `pcf` crate it builds on, it favours a direct, auditable mapping onto the +# written specification (`specs/PFS-MS-spec-v1.0.txt`) over raw performance. 
+ +[[bin]] +name = "pfs" +path = "src/bin/pfs.rs" + +[dependencies] +# The PFS-MS profile is layered strictly above PCF v1.0; every byte container +# operation goes through the reference PCF crate. +pcf = { path = "../PCF-v1.0" } + +# VCDIFF (RFC 3284) is the required delta algorithm (patch_algo_id = 1). +# `oxidelta` is a pure-Rust encoder/decoder; default features (CLI, file-io, +# lzma/zlib secondary compressors) are disabled to keep the dependency minimal. +oxidelta = { version = "=0.1.4", default-features = false } + +# 16-byte node_id / PCF uid generation (UUIDv7 recommended by both specs). +uuid = { version = "1", features = ["v7"] } + +# DEFLATE (RFC 1951) is the required content-compression algorithm +# (compression_algo_id = 1). `flate2`'s pure-Rust miniz_oxide backend keeps the +# crate free of C dependencies and fully portable. +flate2 = { version = "1", default-features = false, features = ["rust_backend"] } + +# Portable file modification-time setting for the directory-import/extract tools. +filetime = "0.2" diff --git a/reference/PFS-MS-v1.0/README.md b/reference/PFS-MS-v1.0/README.md new file mode 100644 index 0000000..990377a --- /dev/null +++ b/reference/PFS-MS-v1.0/README.md @@ -0,0 +1,188 @@ +# pfs-ms — PFS-MS v1.0 (reference implementation) + +Reference reader/writer for **PFS-MS v1.0** (PCF File System, Multi-Session +Profile): an append-only, multi-session tree of files and directories stored +inside a single **PCF v1.0** file. + +This crate mirrors the written specification (`specs/PFS-MS-spec-v1.0.txt`) +field-for-field and builds entirely on the [`pcf`](../PCF-v1.0) reference crate. +It favours auditability over performance. + +PFS-MS is layered *strictly above* PCF: **a PFS-MS file is a fully conforming +PCF file**. A generic PCF reader sees a valid flat set of partitions and +verifies every `table_hash`/`data_hash`; it simply does not reconstruct the +tree. 
PFS-MS adds no new container mechanics — it uses two application partition +types, the PCF RAW type, PCF's flexible `next_table_offset`, and the single +in-place header-pointer rewrite that PCF already permits. + +## Model + +* File **content** lives in PCF **RAW** partitions (`0xFFFFFFFF`): either the + full bytes (DIRECT) or a VCDIFF patch (DELTA) against the previous version, + in either case **optionally compressed** (see below). +* **Node** metadata lives in **PFS_NODE** partitions (`0xAAAA0001`): one + declarative snapshot of a file/directory per session it changed in. +* **Session** metadata lives in **PFS_SESSION** partitions (`0xAAAA0002`): one + per session, carrying the inter-session hash chain. + +Each session appends new bytes and **backward-links** its Table Block(s) to the +previous session's HEAD block (newest → oldest). Committing a session writes all +data and blocks beyond the live chain, then atomically rewrites the 8-byte +`partition_table_offset` in the PCF header — the only in-place mutation +(Section 4.3). A reader walks the chain from the head, groups blocks into +sessions, and resolves the newest record per node (newest wins). + +``` +header.partition_table_offset --> HEAD(newest) --> ... --> HEAD(oldest) --> 0 +``` + +## Partition types and magics + +| value | name | data | +|--------------|-------------|----------------------------------------| +| `0xAAAA0001` | PFS_NODE | one Node Record (magic `"PFSN"`) | +| `0xAAAA0002` | PFS_SESSION | one Session Record (magic `"PFSS"`) | +| `0xFFFFFFFF` | RAW | file content: full bytes or a patch | + +## Compression (Section 9.5) + +The bytes stored in a DIRECT content partition (the full content) or a DELTA +patch partition (the patch) may be compressed. The content section carries a +`compression_algo_id`; DIRECT is 91 bytes and DELTA is 165 bytes (one byte more +than the uncompressed-only draft). 
The writer DEFLATEs the bytes and stores the +compressed form only when it is smaller, else stores them verbatim. + +| id | algorithm | notes | +|----|-----------|-------| +| 0 | none | stored verbatim (required) | +| 1 | DEFLATE | RFC 1951, the required default (pure-Rust `flate2`/miniz_oxide) | +| 2 | zstd | reserved | +| 3 | brotli | reserved | + +Integrity layers cleanly: the PCF `data_hash` protects the **stored +(compressed)** bytes; `full_hash`/`full_size` protect the **decompressed** +content. An unknown `compression_algo_id` makes a file *unreadable* but not the +container *malformed* (the same rule as an unknown `patch_algo_id`). + +> This revision changes the v1.0 content-section layout and is intentionally +> **not** compatible with files written by earlier drafts. + +## Library usage + +```rust +use std::io::Cursor; +use pcf::HashAlgo; +use pfs_ms::{FsReader, FsWriter}; + +let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256)?; +w.mkdir("docs")?; +w.put_file("docs/hello.txt", b"Hello\n")?; // DIRECT +w.put_file("docs/hello.txt", b"Hello, world\n")?; // DELTA (auto) +let bytes = w.into_storage().into_inner(); + +let mut r = FsReader::open(Cursor::new(bytes))?; +r.verify()?; // incl. inter-session chain +assert_eq!(r.read_path("docs/hello.txt")?, b"Hello, world\n"); +// History query "as of" an earlier session (Section 15): +assert_eq!(r.read_path_as_of("docs/hello.txt", Some(2))?, b"Hello\n"); +# Ok::<(), pfs_ms::Error>(()) +``` + +`FsReader`/`FsWriter` work with any `Read + Write + Seek` backing store +(`std::fs::File`, `std::io::Cursor<Vec<u8>>`, …). VCDIFF (RFC 3284) deltas are +provided by the pure-Rust [`oxidelta`](https://crates.io/crates/oxidelta) crate +and DEFLATE compression by [`flate2`](https://crates.io/crates/flate2) +(miniz_oxide backend); node/uid identities use UUIDv7. 
+ +## CLI + +A small demo CLI (`pfs`) drives whole sessions end to end: + +``` +cargo run --bin pfs -- mkfs fs.pfs +cargo run --bin pfs -- mkdir fs.pfs docs +echo hi | cargo run --bin pfs -- put fs.pfs docs/hello.txt - +cargo run --bin pfs -- put fs.pfs docs/hello.txt ./bigger.bin +cargo run --bin pfs -- put fs.pfs docs/raw.bin ./data.bin --store # no compression +cargo run --bin pfs -- mv fs.pfs docs documents +cargo run --bin pfs -- rm fs.pfs documents/hello.txt +cargo run --bin pfs -- ls fs.pfs +cargo run --bin pfs -- log fs.pfs +cargo run --bin pfs -- verify fs.pfs +``` + +### Directory commands + +Whole-directory import/export. Each `create` and `update` is committed as a +**single session** (one "burn"), not one session per file. + +``` +# Create a new archive from a directory tree (fails if the archive exists). +cargo run --bin pfs -- create backup.pfs ./project + +# Update it from the directory: add new files, update changed ones. With +# --delete it mirrors (tombstones archive entries no longer in the source). +cargo run --bin pfs -- update backup.pfs ./project +cargo run --bin pfs -- update backup.pfs ./project --delete + +# Extract the whole tree to a directory, optionally at a point in time. +cargo run --bin pfs -- extract backup.pfs ./restore +cargo run --bin pfs -- extract backup.pfs ./restore --at 2 # by session_seq +cargo run --bin pfs -- extract backup.pfs ./restore --at-time 1700000000000 +``` + +POSIX permission bits and modification time are captured on import and restored +on extract; pass `--no-metadata` (on either side) to skip this, and `--store` to +disable compression. Symlinks and other non-regular files are skipped with a +warning. 
+ +## Layout + +``` +reference/PFS-MS-v1.0/ +├── Cargo.toml +├── src/ +│ ├── lib.rs # crate root + re-exports +│ ├── consts.rs # on-disk constants (Appendix B) +│ ├── node.rs # PFS_NODE record + content sections (Section 7) +│ ├── session.rs # PFS_SESSION record + hash-chain helpers (Section 8) +│ ├── delta.rs # VCDIFF wrapper (Section 9.2) +│ ├── compress.rs # DEFLATE wrapper + registry (Section 9.5) +│ ├── writer.rs # append-only session writer (Sections 4, 6, 12) +│ ├── reader.rs # backward-chain scan + node view (Sections 8, 10, 11) +│ ├── tree.rs # liveness, tree, reconstruction (Sections 9.3, 10) +│ ├── fs.rs # high-level FsReader +│ ├── dirsync.rs # directory <-> archive tooling (create/update/extract) +│ ├── vector.rs # canonical Section 17 reference vector +│ └── bin/pfs.rs # demo CLI +├── tests/ +│ ├── roundtrip.rs # end-to-end black-box tests +│ ├── coverage.rs # targeted error-path / edge-case tests +│ ├── dirsync.rs # directory create/update/extract round-trips +│ └── spec_compliance.rs # one test per normative MUST (R1..R8, W2/W3) +└── examples/ + └── gen_testvector.rs # writes pfs_ms_testvector.bin + hex dumps +``` + +## Tests + +``` +cargo test # unit + integration + doc tests +cargo run --example gen_testvector # writes pfs_ms_testvector.bin (2986 bytes) +cargo llvm-cov --ignore-filename-regex 'bin/|examples/' # library coverage +``` + +CI (`.github/workflows/ci-pfs.yml`) runs `cargo fmt --check`, `cargo clippy -D +warnings`, `cargo test` on Linux/macOS/Windows, the test-vector example, and +`cargo llvm-cov` with a library line/function floor (the `pfs` CLI is exercised +manually, so it is excluded from the coverage gate). + +## Relationship to PCF + +This crate uses only the **public** PCF primitives — `FileHeader`, +`TableBlockHeader`, `PartitionEntry`, `compute_table_hash`, `HashAlgo`, +`encode_label`, and `Container::read_block_at` (a read-only per-block walker). 
+It never uses PCF's in-place `Container` *writer*, because PFS-MS requires +backward-linked blocks and a single header-pointer rewrite at commit. The only +addition made to the PCF crate for this profile is the additive, read-only +`read_block_at`/`BlockView` API. diff --git a/reference/PFS-MS-v1.0/examples/gen_testvector.rs b/reference/PFS-MS-v1.0/examples/gen_testvector.rs new file mode 100644 index 0000000..ccdf545 --- /dev/null +++ b/reference/PFS-MS-v1.0/examples/gen_testvector.rs @@ -0,0 +1,73 @@ +//! Generate the canonical PFS-MS reference test vector for the Section 17 +//! scenario, write it to `pfs_ms_testvector.bin`, and print a hex dump plus the +//! key hashes — mirroring `pcf`'s `gen_testvector` example. +//! +//! Run with: `cargo run --example gen_testvector` + +use std::io::Cursor; + +use pcf::{Container, HashAlgo}; +use pfs_ms::{build_reference_vector, FsReader, PFS_NODE_TYPE, PFS_SESSION_TYPE}; + +fn hexdump(bytes: &[u8]) { + for (i, chunk) in bytes.chunks(16).enumerate() { + let hex: Vec = chunk.iter().map(|b| format!("{b:02X}")).collect(); + let ascii: String = chunk + .iter() + .map(|&b| { + if (0x20..0x7F).contains(&b) { + b as char + } else { + '.' + } + }) + .collect(); + println!("{:04X} {:<48} {}", i * 16, hex.join(" "), ascii); + } +} + +fn main() { + let bytes = build_reference_vector().expect("build vector"); + + std::fs::write("pfs_ms_testvector.bin", &bytes).expect("write file"); + println!("wrote pfs_ms_testvector.bin ({} bytes)\n", bytes.len()); + + println!( + "SHA-256(file) = {}", + hex(&HashAlgo::Sha256.compute(&bytes)[..32]) + ); + println!(); + + println!("==== full file hex dump ===="); + hexdump(&bytes); + println!(); + + // Dump each record type by reading the partitions back out via PCF. 
+ let mut c = Container::open(Cursor::new(bytes.clone())).expect("pcf open"); + c.verify().expect("pcf verify"); + for e in c.entries().expect("entries") { + let data = c.read_partition_data(&e).expect("data"); + let label = match e.partition_type { + t if t == PFS_NODE_TYPE => "PFS_NODE", + t if t == PFS_SESSION_TYPE => "PFS_SESSION", + _ => "RAW", + }; + println!( + "---- {label} (uid {}, {} bytes, data_hash {}) ----", + hex(&e.uid), + data.len(), + hex(&e.data_hash[..32]) + ); + hexdump(&data); + println!(); + } + + // Confirm the vector reconstructs. + let mut r = FsReader::open(Cursor::new(bytes)).expect("pfs open"); + r.verify().expect("pfs verify"); + println!("reconstruction verified OK"); +} + +fn hex(bytes: &[u8]) -> String { + bytes.iter().map(|b| format!("{b:02x}")).collect() +} diff --git a/reference/PFS-MS-v1.0/src/bin/pfs.rs b/reference/PFS-MS-v1.0/src/bin/pfs.rs new file mode 100644 index 0000000..a4e82e9 --- /dev/null +++ b/reference/PFS-MS-v1.0/src/bin/pfs.rs @@ -0,0 +1,310 @@ +//! `pfs` — a small demo CLI for the PFS-MS reference implementation. +//! +//! Each mutating subcommand opens the file, commits exactly one session, and +//! flushes; read subcommands reconstruct the filesystem at the head. +//! +//! ```text +//! pfs mkfs <file> +//! pfs mkdir <file> <path> +//! pfs put <file> <path> [<src>] [--store] # <src> defaults to stdin +//! pfs mv <file> <src> <dst> +//! pfs rm <file> <path> +//! pfs ls <file> [<path>] +//! pfs cat <file> <path> +//! pfs get <file> <path> <dest> +//! pfs log <file> +//! pfs verify <file> +//! pfs create <file> <dir> [--store] [--no-metadata] +//! pfs update <file> <dir> [--delete] [--store] [--no-metadata] +//! pfs extract <file> <dir> [--at <seq>] [--at-time <ms>] [--no-metadata] +//! 
```
+
+use std::collections::{HashMap, HashSet};
+use std::fs::{File, OpenOptions};
+use std::io::{Read, Write};
+use std::path::Path;
+use std::process::ExitCode;
+
+use pcf::HashAlgo;
+use pfs_ms::{FsReader, FsWriter, SyncOptions, Tree, ROOT_NODE_ID};
+
+/// Result of a subcommand: `Ok(())` on success, or a message for the user.
+type CliResult = Result<(), String>;
+
+/// Entry point: run the subcommand named by the first argument.
+///
+/// On failure the message is printed to stderr prefixed with `pfs: ` and the
+/// process exits with `ExitCode::FAILURE`.
+fn main() -> ExitCode {
+    // Skip argv[0] (the program name); `run` only sees the user's arguments.
+    let args: Vec<String> = std::env::args().skip(1).collect();
+    match run(&args) {
+        Ok(()) => ExitCode::SUCCESS,
+        Err(msg) => {
+            eprintln!("pfs: {msg}");
+            ExitCode::FAILURE
+        }
+    }
+}
+
+/// Dispatch `args[0]` to its `cmd_*` handler, passing the remaining arguments.
+///
+/// An empty command line, `help`, `-h` and `--help` print the usage text and
+/// succeed; any other unrecognised command is an error.
+fn run(args: &[String]) -> CliResult {
+    let cmd = args.first().map(|s| s.as_str()).unwrap_or("");
+    // Everything after the command name; empty when no arguments were given.
+    let rest = args.get(1..).unwrap_or(&[]);
+    match cmd {
+        "mkfs" => cmd_mkfs(rest),
+        "mkdir" => cmd_mkdir(rest),
+        "put" => cmd_put(rest),
+        "mv" => cmd_mv(rest),
+        "rm" => cmd_rm(rest),
+        "ls" => cmd_ls(rest),
+        "cat" => cmd_cat(rest),
+        "get" => cmd_get(rest),
+        "log" => cmd_log(rest),
+        "verify" => cmd_verify(rest),
+        "create" => cmd_create(rest),
+        "update" => cmd_update(rest),
+        "extract" => cmd_extract(rest),
+        "" | "help" | "-h" | "--help" => {
+            print_usage();
+            Ok(())
+        }
+        other => Err(format!("unknown command '{other}' (try `pfs help`)")),
+    }
+}
+
+/// Print the one-line synopsis of every subcommand to stderr.
+fn print_usage() {
+    eprintln!(
+        "usage:\n pfs mkfs <file>\n pfs mkdir <file> <path>\n pfs put <file> <path> [<src>] [--store]\n pfs mv <file> <src> <dst>\n pfs rm <file> <path>\n pfs ls <file> [<path>]\n pfs cat <file> <path>\n pfs get <file> <path> <dest>\n pfs log <file>\n pfs verify <file>\n pfs create <file> <dir> [--store] [--no-metadata]\n pfs update <file> <dir> [--delete] [--store] [--no-metadata]\n pfs extract <file> <dir> [--at <seq>] [--at-time <ms>] [--no-metadata]"
+    );
+}
+
+/// Return the `i`-th argument, or `missing argument: {what}` if it is absent.
+///
+/// `what` is the placeholder shown to the user (for example `<file>`).
+fn arg<'a>(args: &'a [String], i: usize, what: &str) -> Result<&'a str, String> {
+    args.get(i)
+        .map(|s| s.as_str())
+        .ok_or_else(|| format!("missing argument: {what}"))
+}
+
+/// Parsed command line: positionals, boolean flags, and `--flag value` pairs.
+struct Parsed {
+    /// Tokens that do not start with `--`, in command-line order.
+    positional: Vec<String>,
+    /// Names (without the leading `--`) of flags that take no value.
+    flags: HashSet<String>,
+    /// Flag name (without the leading `--`) mapped to the token that followed it.
+    values: HashMap<String, String>,
+}
+
+/// Split `args` into positionals, boolean flags, and value flags.
Any flag in +/// `value_flags` consumes the following token as its value. +fn parse_flags(args: &[String], value_flags: &[&str]) -> Result { + let mut p = Parsed { + positional: Vec::new(), + flags: HashSet::new(), + values: HashMap::new(), + }; + let mut i = 0; + while i < args.len() { + let a = &args[i]; + if let Some(name) = a.strip_prefix("--") { + if value_flags.contains(&name) { + let v = args + .get(i + 1) + .ok_or_else(|| format!("flag --{name} needs a value"))?; + p.values.insert(name.to_string(), v.clone()); + i += 2; + } else { + p.flags.insert(name.to_string()); + i += 1; + } + } else { + p.positional.push(a.clone()); + i += 1; + } + } + Ok(p) +} + +fn open_rw(path: &str) -> Result { + OpenOptions::new() + .read(true) + .write(true) + .open(path) + .map_err(|e| format!("cannot open '{path}': {e}")) +} + +fn open_writer(path: &str) -> Result, String> { + FsWriter::open(open_rw(path)?).map_err(|e| e.to_string()) +} + +fn open_reader(path: &str) -> Result, String> { + FsReader::open(open_rw(path)?).map_err(|e| e.to_string()) +} + +fn cmd_mkfs(a: &[String]) -> CliResult { + let file = arg(a, 0, "")?; + let f = File::create(file).map_err(|e| format!("cannot create '{file}': {e}"))?; + FsWriter::mkfs(f, HashAlgo::Sha256).map_err(|e| e.to_string())?; + Ok(()) +} + +fn cmd_mkdir(a: &[String]) -> CliResult { + let file = arg(a, 0, "")?; + let path = arg(a, 1, "")?; + open_writer(file)?.mkdir(path).map_err(|e| e.to_string()) +} + +fn cmd_put(a: &[String]) -> CliResult { + // `--store` (anywhere after the file) disables compression for this write. 
let store = a.iter().any(|s| s == "--store");
    // Everything that is not the `--store` switch is positional, in order.
    let positional: Vec<&str> = a
        .iter()
        .map(|s| s.as_str())
        .filter(|s| *s != "--store")
        .collect();
    let file = positional
        .first()
        .copied()
        .ok_or("missing argument: <file>")?;
    let path = positional
        .get(1)
        .copied()
        .ok_or("missing argument: <path>")?;
    // An omitted source, or an explicit "-", means "read the content from stdin".
    let src = positional.get(2).copied().unwrap_or("-");
    let data = if src == "-" {
        let mut buf = Vec::new();
        std::io::stdin()
            .read_to_end(&mut buf)
            .map_err(|e| e.to_string())?;
        buf
    } else {
        std::fs::read(src).map_err(|e| format!("cannot read '{src}': {e}"))?
    };
    let mut w = open_writer(file)?;
    w.set_compression(!store);
    w.put_file(path, &data).map_err(|e| e.to_string())
}

/// `pfs mv <file> <src> <dst>`: move or rename a node inside the archive.
fn cmd_mv(a: &[String]) -> CliResult {
    let file = arg(a, 0, "<file>")?;
    let src = arg(a, 1, "<src>")?;
    let dst = arg(a, 2, "<dst>")?;
    open_writer(file)?.mv(src, dst).map_err(|e| e.to_string())
}

/// `pfs rm <file> <path>`: remove the node at `<path>` from the archive.
fn cmd_rm(a: &[String]) -> CliResult {
    let file = arg(a, 0, "<file>")?;
    let path = arg(a, 1, "<path>")?;
    open_writer(file)?.rm(path).map_err(|e| e.to_string())
}

/// `pfs ls <file> [<path>]`: print the tree rooted at `<path>`; an omitted
/// path is passed to `resolve_path` as the empty string.
fn cmd_ls(a: &[String]) -> CliResult {
    let file = arg(a, 0, "<file>")?;
    let base = a.get(1).map(|s| s.as_str()).unwrap_or("");
    let mut r = open_reader(file)?;
    let tree = r.tree().map_err(|e| e.to_string())?;
    let start = pfs_ms::resolve_path(&tree, base).map_err(|e| e.to_string())?;
    print_tree(&tree, start, 0);
    Ok(())
}

/// Print node `id` and, recursively, its children, one entry per line.
///
/// The root is printed as `/`; any other node is printed as its name, indented
/// by `depth` and suffixed with `/` when it is a directory.
fn print_tree(tree: &Tree, id: [u8; 16], depth: usize) {
    if let Some(rec) = tree.nodes.get(&id) {
        if id == ROOT_NODE_ID {
            println!("/");
        } else {
            let suffix = if rec.is_dir() { "/" } else { "" };
            println!("{}{}{}", " ".repeat(depth), rec.name_str(), suffix);
        }
    }
    if let Some(kids) = tree.children.get(&id) {
        for &k in kids {
            print_tree(tree, k, depth + 1);
        }
    }
}

fn cmd_cat(a: &[String]) -> CliResult {
    let file = arg(a, 0, "<file>")?;
    let path = arg(a, 1, "<path>")?;
    let data = open_reader(file)?
.read_path(path)
        .map_err(|e| e.to_string())?;
    std::io::stdout()
        .write_all(&data)
        .map_err(|e| e.to_string())
}

/// `pfs get <file> <path> <out>`: write the content of `<path>` to the host
/// file `<out>`.
fn cmd_get(a: &[String]) -> CliResult {
    let file = arg(a, 0, "<file>")?;
    let path = arg(a, 1, "<path>")?;
    let out = arg(a, 2, "<out>")?;
    let data = open_reader(file)?
        .read_path(path)
        .map_err(|e| e.to_string())?;
    std::fs::write(out, &data).map_err(|e| format!("cannot write '{out}': {e}"))
}

/// `pfs log <file>`: print one line per session, in the order returned by
/// `list_sessions`.
fn cmd_log(a: &[String]) -> CliResult {
    let file = arg(a, 0, "<file>")?;
    let sessions = open_reader(file)?
        .list_sessions()
        .map_err(|e| e.to_string())?;
    for s in &sessions {
        // The writer id is raw bytes; show it lossily rather than fail on bad UTF-8.
        let writer = String::from_utf8_lossy(&s.writer);
        println!(
            "seq {:<4} blocks {:<3} changes {:<4} ts {:<14} writer {}",
            s.session_seq, s.block_count, s.change_count, s.timestamp_unix_ms, writer
        );
    }
    Ok(())
}

/// `pfs verify <file>`: run the reader's integrity check and print `ok`.
fn cmd_verify(a: &[String]) -> CliResult {
    let file = arg(a, 0, "<file>")?;
    open_reader(file)?.verify().map_err(|e| e.to_string())?;
    println!("ok");
    Ok(())
}

/// Fail if `p` carries a boolean flag that is not in `allowed`.
///
/// Without this check a misspelled switch (`--delet`, `--no-metdata`) is
/// accepted and silently has no effect.
fn reject_unknown_flags(p: &Parsed, allowed: &[&str]) -> CliResult {
    let mut unknown: Vec<&str> = p
        .flags
        .iter()
        .map(|f| f.as_str())
        .filter(|f| !allowed.contains(f))
        .collect();
    if unknown.is_empty() {
        return Ok(());
    }
    // The set iterates in arbitrary order; sort so the message is stable.
    unknown.sort_unstable();
    Err(format!("unknown flag --{}", unknown.join(", --")))
}

/// `pfs create <archive> <dir> [--store] [--no-metadata]`: build a new archive
/// from a host directory.
fn cmd_create(a: &[String]) -> CliResult {
    let p = parse_flags(a, &[])?;
    reject_unknown_flags(&p, &["store", "no-metadata"])?;
    let archive = p.positional.first().ok_or("missing argument: <archive>")?;
    let dir = p.positional.get(1).ok_or("missing argument: <dir>")?;
    let opts = SyncOptions {
        compress: !p.flags.contains("store"),
        metadata: !p.flags.contains("no-metadata"),
        delete: false,
    };
    pfs_ms::create_archive(Path::new(archive), Path::new(dir), &opts).map_err(|e| e.to_string())
}

/// `pfs update <archive> <dir> [--delete] [--store] [--no-metadata]`: update an
/// existing archive from a host directory.
fn cmd_update(a: &[String]) -> CliResult {
    let p = parse_flags(a, &[])?;
    reject_unknown_flags(&p, &["delete", "store", "no-metadata"])?;
    let archive = p.positional.first().ok_or("missing argument: <archive>")?;
    let dir = p.positional.get(1).ok_or("missing argument: <dir>")?;
    let opts = SyncOptions {
        compress: !p.flags.contains("store"),
        metadata: !p.flags.contains("no-metadata"),
        delete: p.flags.contains("delete"),
    };
    pfs_ms::update_archive(Path::new(archive), Path::new(dir), &opts).map_err(|e| e.to_string())
}

fn cmd_extract(a: &[String]) -> CliResult {
    let p = parse_flags(a, &["at", "at-time"])?;
    reject_unknown_flags(&p, &["no-metadata"])?;
    let archive =
p.positional.first().ok_or("missing argument: ")?; + let dir = p.positional.get(1).ok_or("missing argument: ")?; + let metadata = !p.flags.contains("no-metadata"); + + let at: Option = if let Some(seq) = p.values.get("at") { + Some( + seq.parse() + .map_err(|_| format!("invalid --at value '{seq}'"))?, + ) + } else if let Some(ms) = p.values.get("at-time") { + let ms: u64 = ms + .parse() + .map_err(|_| format!("invalid --at-time value '{ms}'"))?; + Some(pfs_ms::session_at_time(Path::new(archive), ms).map_err(|e| e.to_string())?) + } else { + None + }; + + pfs_ms::extract_archive(Path::new(archive), Path::new(dir), at, metadata) + .map_err(|e| e.to_string()) +} diff --git a/reference/PFS-MS-v1.0/src/compress.rs b/reference/PFS-MS-v1.0/src/compress.rs new file mode 100644 index 0000000..f44d3bf --- /dev/null +++ b/reference/PFS-MS-v1.0/src/compress.rs @@ -0,0 +1,72 @@ +//! Content compression (Section 9.4). DEFLATE (RFC 1951, `compression_algo_id = +//! 1`) is the required default and is implemented via the pure-Rust +//! `flate2`/`miniz_oxide` backend. +//! +//! Only the bytes stored in a RAW content partition are compressed: the DIRECT +//! full content, or the DELTA patch. The PCF `data_hash` protects the stored +//! (compressed) bytes; the Node Record's `full_hash`/`full_size` protect the +//! reconstructed (decompressed) content. + +use std::io::{Read, Write}; + +use flate2::read::DeflateDecoder; +use flate2::write::DeflateEncoder; +use flate2::Compression; + +use crate::consts::{COMPRESS_DEFLATE, COMPRESS_NONE}; +use crate::error::{Error, Result}; + +/// DEFLATE-compress `data`. A fixed compression level keeps the output +/// deterministic so byte-exact test vectors are reproducible. 
+pub fn compress_deflate(data: &[u8]) -> Result> { + let mut enc = DeflateEncoder::new(Vec::new(), Compression::new(6)); + enc.write_all(data) + .map_err(|e| Error::Compression(format!("deflate: {e}")))?; + enc.finish() + .map_err(|e| Error::Compression(format!("deflate finish: {e}"))) +} + +/// Decompress `data` according to `compression_algo_id`. +/// +/// An unimplemented id yields [`Error::UnsupportedCompressionAlgo`] so the +/// caller can report the affected file as unreadable without treating the +/// container as malformed (Section 9.4). +pub fn decompress(compression_algo_id: u8, data: &[u8]) -> Result> { + match compression_algo_id { + COMPRESS_NONE => Ok(data.to_vec()), + COMPRESS_DEFLATE => { + let mut out = Vec::new(); + DeflateDecoder::new(data) + .read_to_end(&mut out) + .map_err(|e| Error::Compression(format!("inflate: {e}")))?; + Ok(out) + } + other => Err(Error::UnsupportedCompressionAlgo(other)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn deflate_roundtrips() { + let data = b"the quick brown fox".repeat(50); + let packed = compress_deflate(&data).unwrap(); + assert!(packed.len() < data.len(), "repetitive input should shrink"); + assert_eq!(decompress(COMPRESS_DEFLATE, &packed).unwrap(), data); + } + + #[test] + fn none_is_verbatim() { + assert_eq!(decompress(COMPRESS_NONE, b"abc").unwrap(), b"abc"); + } + + #[test] + fn unknown_algo_is_reported() { + assert!(matches!( + decompress(2, b"x"), + Err(Error::UnsupportedCompressionAlgo(2)) + )); + } +} diff --git a/reference/PFS-MS-v1.0/src/consts.rs b/reference/PFS-MS-v1.0/src/consts.rs new file mode 100644 index 0000000..f90cf3f --- /dev/null +++ b/reference/PFS-MS-v1.0/src/consts.rs @@ -0,0 +1,77 @@ +//! On-disk constants defined by PFS-MS v1.0. +//! +//! Every value here is normative and corresponds directly to a figure in the +//! specification (`specs/PFS-MS-spec-v1.0.txt`, Section 5, Section 7, Section 8 +//! and Appendix B). 
+ +/// PCF partition type for a Node Record (Section 7). +pub const PFS_NODE_TYPE: u32 = 0xAAAA_0001; +/// PCF partition type for a Session Record (Section 8). +pub const PFS_SESSION_TYPE: u32 = 0xAAAA_0002; +/// PCF RAW/BLOB type used for file content (full bytes or a delta patch). +pub const RAW_TYPE: u32 = pcf::TYPE_RAW; + +/// Node Record magic, `"PFSN"`. +pub const NODE_MAGIC: [u8; 4] = *b"PFSN"; +/// Session Record magic, `"PFSS"`. +pub const SESSION_MAGIC: [u8; 4] = *b"PFSS"; + +/// `record_version` of a Node Record in this profile version. +pub const NODE_RECORD_VERSION: u8 = 1; +/// Profile major version. +pub const PROFILE_VERSION_MAJOR: u8 = 1; +/// Profile minor version. +pub const PROFILE_VERSION_MINOR: u8 = 0; + +/// Node kind: a file. +pub const KIND_FILE: u8 = 1; +/// Node kind: a directory. +pub const KIND_DIR: u8 = 2; + +/// Node flag bit 0: the node is deleted as of this session. +pub const FLAG_TOMBSTONE: u16 = 0x0001; +/// Mask of all defined node-flag bits (others MUST be 0). +pub const FLAG_DEFINED_MASK: u16 = FLAG_TOMBSTONE; + +/// `content_kind`: the empty byte string. +pub const CONTENT_EMPTY: u8 = 0; +/// `content_kind`: full bytes in one RAW partition. +pub const CONTENT_DIRECT: u8 = 1; +/// `content_kind`: a patch against the previous version. +pub const CONTENT_DELTA: u8 = 2; +/// `content_kind`: identical bytes to the previous version. +pub const CONTENT_INHERIT: u8 = 3; + +/// `patch_algo_id`: VCDIFF (RFC 3284), the required default. +pub const PATCH_VCDIFF: u8 = 1; + +/// `compression_algo_id`: stored verbatim (no compression). +pub const COMPRESS_NONE: u8 = 0; +/// `compression_algo_id`: DEFLATE (RFC 1951), the required default. +/// Identifiers 2 (zstd) and 3 (brotli) are reserved for a future revision. +pub const COMPRESS_DEFLATE: u8 = 1; + +/// The reserved root `node_id` (16 zero bytes). +pub const ROOT_NODE_ID: [u8; 16] = [0u8; 16]; + +/// Maximum UTF-8 byte length of a node name (`PFS_MAX_NAME`). 
+pub const PFS_MAX_NAME: usize = 1024; + +/// Fixed prefix length of a Node Record, in bytes (Section 7.1). +pub const NODE_PREFIX_LEN: usize = 54; +/// Length of a DIRECT content section, in bytes (Section 7.3). Includes the +/// `compression_algo_id` byte. +pub const DIRECT_SECTION_LEN: usize = 91; +/// Length of a DELTA content section, in bytes (Section 7.3). Includes the +/// `compression_algo_id` byte. +pub const DELTA_SECTION_LEN: usize = 165; +/// Fixed prefix length of a Session Record (before the writer field). +pub const SESSION_PREFIX_LEN: usize = 162; + +/// Writer re-baseline threshold (`PFS_RECOMMENDED_MAX_DELTA_DEPTH`). +pub const RECOMMENDED_MAX_DELTA_DEPTH: usize = 16; +/// Minimum delta depth a reader must support (`PFS_MIN_READER_DELTA_DEPTH`). +pub const MIN_READER_DELTA_DEPTH: usize = 64; + +/// Width of every hash field (matches PCF's 64-byte fields). +pub const HASH_FIELD_SIZE: usize = pcf::HASH_FIELD_SIZE; diff --git a/reference/PFS-MS-v1.0/src/delta.rs b/reference/PFS-MS-v1.0/src/delta.rs new file mode 100644 index 0000000..7ed420a --- /dev/null +++ b/reference/PFS-MS-v1.0/src/delta.rs @@ -0,0 +1,50 @@ +//! Delta encoding (Section 9.2). VCDIFF (RFC 3284, `patch_algo_id = 1`) is the +//! required default and is implemented via the pure-Rust `oxidelta` crate. + +use oxidelta::compress::{decoder, encoder}; + +use crate::consts::PATCH_VCDIFF; +use crate::error::{Error, Result}; + +/// Produce a VCDIFF patch transforming `base` into `target`. +pub fn diff_vcdiff(base: &[u8], target: &[u8]) -> Result> { + let mut out = Vec::new(); + encoder::encode_all(&mut out, base, target, encoder::CompressOptions::default()) + .map_err(|e| Error::Vcdiff(format!("encode: {e}")))?; + Ok(out) +} + +/// Apply a patch of algorithm `patch_algo` to `base`, returning the result. 
+/// +/// An unimplemented `patch_algo` yields [`Error::UnsupportedPatchAlgo`] so the +/// caller can report the affected file as unreadable without treating the +/// container as malformed (Section 9.2). +pub fn apply(patch_algo: u8, base: &[u8], patch: &[u8]) -> Result> { + match patch_algo { + PATCH_VCDIFF => { + decoder::decode_all(base, patch).map_err(|e| Error::Vcdiff(format!("decode: {e}"))) + } + other => Err(Error::UnsupportedPatchAlgo(other)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn vcdiff_roundtrips() { + let base = b"Hello\n"; + let target = b"Hello, world\n"; + let patch = diff_vcdiff(base, target).unwrap(); + assert_eq!(apply(PATCH_VCDIFF, base, &patch).unwrap(), target); + } + + #[test] + fn unknown_algo_is_reported() { + assert!(matches!( + apply(2, b"a", b"b"), + Err(Error::UnsupportedPatchAlgo(2)) + )); + } +} diff --git a/reference/PFS-MS-v1.0/src/dirsync.rs b/reference/PFS-MS-v1.0/src/dirsync.rs new file mode 100644 index 0000000..2ee0301 --- /dev/null +++ b/reference/PFS-MS-v1.0/src/dirsync.rs @@ -0,0 +1,307 @@ +//! Directory <-> archive tooling: build an archive from a host directory, +//! update it from a directory, and extract it back to a directory. +//! +//! This is the only module that touches the host filesystem. Each `create` or +//! `update` is committed as a SINGLE session via [`FsWriter::commit_changes`] +//! (one "burn"), and `extract` can reconstruct any point in history. Symlinks +//! and other non-regular files are skipped with a warning; only regular files +//! and directories are imported. + +use std::collections::HashSet; +use std::fs::{self, File, OpenOptions}; +use std::path::Path; +use std::time::UNIX_EPOCH; + +use pcf::HashAlgo; + +use crate::error::{Error, Result}; +use crate::fs::FsReader; +use crate::tree::Tree; +use crate::writer::{Change, FsWriter}; +use crate::ROOT_NODE_ID; + +/// Options for [`create_archive`] / [`update_archive`]. 
+#[derive(Debug, Clone, Copy)] +pub struct SyncOptions { + /// Compress file content with DEFLATE when smaller (Section 9.5). + pub compress: bool, + /// Capture POSIX mode + mtime from the source into the archive. + pub metadata: bool, + /// (update only) Tombstone archive entries absent from the source (mirror). + pub delete: bool, +} + +impl Default for SyncOptions { + fn default() -> Self { + SyncOptions { + compress: true, + metadata: true, + delete: false, + } + } +} + +// ---- metadata capture / restore ----------------------------------------- + +#[cfg(unix)] +fn mode_of(meta: &fs::Metadata) -> u32 { + use std::os::unix::fs::PermissionsExt; + meta.permissions().mode() & 0o7777 +} +#[cfg(not(unix))] +fn mode_of(_meta: &fs::Metadata) -> u32 { + 0 +} + +fn mtime_ms_of(meta: &fs::Metadata) -> u64 { + meta.modified() + .ok() + .and_then(|t| t.duration_since(UNIX_EPOCH).ok()) + .map(|d| d.as_millis() as u64) + .unwrap_or(0) +} + +#[cfg(unix)] +fn restore_mode(path: &Path, mode: u32) { + use std::os::unix::fs::PermissionsExt; + if mode != 0 { + let _ = fs::set_permissions(path, fs::Permissions::from_mode(mode)); + } +} +#[cfg(not(unix))] +fn restore_mode(_path: &Path, _mode: u32) {} + +fn restore_mtime(path: &Path, mtime_ms: u64) { + if mtime_ms != 0 { + let secs = (mtime_ms / 1000) as i64; + let nanos = ((mtime_ms % 1000) * 1_000_000) as u32; + let _ = filetime::set_file_mtime(path, filetime::FileTime::from_unix_time(secs, nanos)); + } +} + +// ---- walking the source tree -------------------------------------------- + +fn collect_changes(src: &Path, opts: &SyncOptions) -> Result> { + let mut out = Vec::new(); + walk(src, "", opts, &mut out)?; + Ok(out) +} + +fn walk(dir: &Path, prefix: &str, opts: &SyncOptions, out: &mut Vec) -> Result<()> { + let mut entries: Vec<_> = fs::read_dir(dir)?.collect::>>()?; + entries.sort_by_key(|e| e.file_name()); + for e in entries { + let path = e.path(); + let ft = fs::symlink_metadata(&path)?.file_type(); + let name = 
e.file_name().to_string_lossy().into_owned(); + let rel = if prefix.is_empty() { + name + } else { + format!("{prefix}/{name}") + }; + if ft.is_symlink() { + eprintln!("pfs: skipping symlink {}", path.display()); + continue; + } + if ft.is_dir() { + let meta = fs::metadata(&path)?; + let (mode, mtime) = if opts.metadata { + (mode_of(&meta), mtime_ms_of(&meta)) + } else { + (0, 0) + }; + out.push(Change::Mkdir { + path: rel.clone(), + mode, + mtime_unix_ms: mtime, + }); + walk(&path, &rel, opts, out)?; + } else if ft.is_file() { + let meta = fs::metadata(&path)?; + let (mode, mtime) = if opts.metadata { + (mode_of(&meta), mtime_ms_of(&meta)) + } else { + (0, 0) + }; + let content = fs::read(&path)?; + out.push(Change::PutFile { + path: rel, + content, + mode, + mtime_unix_ms: mtime, + }); + } else { + eprintln!("pfs: skipping special file {}", path.display()); + } + } + Ok(()) +} + +fn open_rw(archive: &Path) -> Result { + OpenOptions::new() + .read(true) + .write(true) + .open(archive) + .map_err(Error::Io) +} + +// ---- public operations --------------------------------------------------- + +/// Create a brand-new archive from the contents of `src`. Fails if `archive` +/// already exists. The root directory is session 1; the imported tree is +/// session 2 (a single session regardless of file count). 
+pub fn create_archive(archive: &Path, src: &Path, opts: &SyncOptions) -> Result<()> { + if !fs::metadata(src)?.is_dir() { + return Err(Error::NotADirectory); + } + let changes = collect_changes(src, opts)?; + let file = OpenOptions::new() + .read(true) + .write(true) + .create_new(true) + .open(archive) + .map_err(|e| { + if e.kind() == std::io::ErrorKind::AlreadyExists { + Error::AlreadyExists + } else { + Error::Io(e) + } + })?; + let mut w = FsWriter::mkfs(file, HashAlgo::Sha256)?; + w.set_writer_id(b"pfs-create"); + w.set_compression(opts.compress); + w.commit_changes(&changes)?; + Ok(()) +} + +/// Update an existing archive from `src`: add new files, update changed ones, +/// and (when `opts.delete`) tombstone live archive entries absent from `src`. +/// All of it is one session. +pub fn update_archive(archive: &Path, src: &Path, opts: &SyncOptions) -> Result<()> { + if !fs::metadata(src)?.is_dir() { + return Err(Error::NotADirectory); + } + let mut changes = collect_changes(src, opts)?; + + if opts.delete { + // Paths present in the source (normalised, '/'-separated). + let source: HashSet = changes + .iter() + .map(|c| match c { + Change::Mkdir { path, .. } => path.clone(), + Change::PutFile { path, .. } => path.clone(), + Change::Remove { path } => path.clone(), + }) + .collect(); + // Live archive paths; tombstone any not in the source. + let live = { + let mut r = FsReader::open(open_rw(archive)?)?; + let tree = r.tree()?; + live_paths(&tree) + }; + for p in live { + if !source.contains(&p) { + changes.push(Change::Remove { path: p }); + } + } + } + + let mut w = FsWriter::open(open_rw(archive)?)?; + w.set_writer_id(b"pfs-update"); + w.set_compression(opts.compress); + w.commit_changes(&changes)?; + Ok(()) +} + +/// Extract the archive's tree (optionally as of session `at`) into `dst`, +/// restoring mode + mtime when `metadata` is true. 
+pub fn extract_archive(archive: &Path, dst: &Path, at: Option, metadata: bool) -> Result<()> { + let mut r = FsReader::open(open_rw(archive)?)?; + let tree = r.tree_as_of(at)?; + fs::create_dir_all(dst)?; + extract_dir(&mut r, &tree, ROOT_NODE_ID, dst, "", at, metadata)?; + Ok(()) +} + +/// Resolve a unix-millisecond timestamp to the newest session_seq committed at +/// or before it (0 if none), for `extract --at-time`. +pub fn session_at_time(archive: &Path, unix_ms: u64) -> Result { + let mut r = FsReader::open(open_rw(archive)?)?; + let mut best = 0u64; + for s in r.list_sessions()? { + if s.timestamp_unix_ms <= unix_ms && s.session_seq > best { + best = s.session_seq; + } + } + Ok(best) +} + +// ---- helpers -------------------------------------------------------------- + +/// All live node paths in the tree (directories and files), root excluded. +fn live_paths(tree: &Tree) -> Vec { + let mut out = Vec::new(); + collect_paths(tree, ROOT_NODE_ID, "", &mut out); + out +} + +fn collect_paths(tree: &Tree, node: [u8; 16], prefix: &str, out: &mut Vec) { + if let Some(kids) = tree.children.get(&node) { + for &cid in kids { + if let Some(rec) = tree.nodes.get(&cid) { + let name = rec.name_str(); + let rel = if prefix.is_empty() { + name + } else { + format!("{prefix}/{name}") + }; + out.push(rel.clone()); + if rec.is_dir() { + collect_paths(tree, cid, &rel, out); + } + } + } + } +} + +fn extract_dir( + r: &mut FsReader, + tree: &Tree, + node: [u8; 16], + host_dir: &Path, + prefix: &str, + at: Option, + metadata: bool, +) -> Result<()> { + let kids = match tree.children.get(&node) { + Some(k) => k.clone(), + None => return Ok(()), + }; + for cid in kids { + let rec = tree.nodes.get(&cid).ok_or(Error::NotFound)?.clone(); + let name = rec.name_str(); + let host = host_dir.join(&name); + let rel = if prefix.is_empty() { + name + } else { + format!("{prefix}/{name}") + }; + if rec.is_dir() { + fs::create_dir_all(&host)?; + extract_dir(r, tree, cid, &host, &rel, at, 
metadata)?; + // Restore directory metadata AFTER its children are written. + if metadata { + restore_mode(&host, rec.mode); + restore_mtime(&host, rec.mtime_unix_ms); + } + } else { + let content = r.read_path_as_of(&rel, at)?; + fs::write(&host, &content)?; + if metadata { + restore_mode(&host, rec.mode); + restore_mtime(&host, rec.mtime_unix_ms); + } + } + } + Ok(()) +} diff --git a/reference/PFS-MS-v1.0/src/error.rs b/reference/PFS-MS-v1.0/src/error.rs new file mode 100644 index 0000000..e02a571 --- /dev/null +++ b/reference/PFS-MS-v1.0/src/error.rs @@ -0,0 +1,109 @@ +//! Error and result types for the PFS-MS reference implementation. + +use std::fmt; + +/// Result alias used throughout the crate. +pub type Result = std::result::Result; + +/// Everything that can go wrong reading, writing, or reconstructing a PFS-MS +/// filesystem. Container-level failures are wrapped from [`pcf::Error`]. +#[derive(Debug)] +pub enum Error { + /// An underlying I/O failure. + Io(std::io::Error), + /// A PCF container-level error (bad magic, hash mismatch, …). + Pcf(pcf::Error), + + /// A Node Record was structurally invalid (bad magic/version/kind, a + /// reserved flag bit set, an out-of-range or illegal name, a truncated + /// content section, …) — spec R4. + MalformedNode(&'static str), + /// A Session Record was structurally invalid — spec R3. + MalformedSession(&'static str), + + /// The backward-linked session chain was inconsistent: a HEAD block lacked + /// its single PFS_SESSION partition, a MEMBER block carried one, the + /// session_seq order was not strictly decreasing, or block_count did not + /// match the chain — spec R3. + BrokenChain(&'static str), + /// The inter-session hash chain failed verification (a table_hash, + /// member_blocks_digest, or prev_session_hash mismatch) — spec R8. + ChainHashMismatch, + + /// The same node_id appeared twice within one session — spec R5. 
+ DuplicateNodeInSession, + /// A liveness walk to the root encountered a cycle — spec R6. + ParentCycle, + + /// A referenced RAW content/patch partition was missing from the file. + MissingContent, + /// A reconstructed file failed its full_hash or base_full_hash check, or a + /// RAW partition failed its PCF data_hash — spec R7. + ContentHashMismatch, + /// A DELTA/INHERIT base could not be resolved (history is malformed). + MissingBase, + /// A delta used an unimplemented patch_algo_id; the affected file is + /// unreadable but the container is not malformed on that basis (Section 9.2). + UnsupportedPatchAlgo(u8), + /// A file's delta chain exceeded the reader's supported depth. + DeltaTooDeep, + /// VCDIFF encode/decode failed. + Vcdiff(String), + /// Content used an unimplemented compression_algo_id; the affected file is + /// unreadable but the container is not malformed on that basis (Section 9.4). + UnsupportedCompressionAlgo(u8), + /// DEFLATE compression/decompression failed. + Compression(String), + + /// A requested path did not resolve to a live node. + NotFound, + /// A path component was not a directory. + NotADirectory, + /// The target already exists where a fresh node was required. + AlreadyExists, + /// An operation supplied an invalid path or name. 
+ InvalidPath(&'static str), +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Error::Io(e) => write!(f, "io error: {e}"), + Error::Pcf(e) => write!(f, "pcf error: {e}"), + Error::MalformedNode(m) => write!(f, "malformed node record: {m}"), + Error::MalformedSession(m) => write!(f, "malformed session record: {m}"), + Error::BrokenChain(m) => write!(f, "broken session chain: {m}"), + Error::ChainHashMismatch => write!(f, "inter-session hash chain mismatch"), + Error::DuplicateNodeInSession => write!(f, "node_id appears twice in one session"), + Error::ParentCycle => write!(f, "cycle in parent hierarchy"), + Error::MissingContent => write!(f, "referenced content partition is missing"), + Error::ContentHashMismatch => write!(f, "file content hash mismatch"), + Error::MissingBase => write!(f, "delta/inherit base is missing"), + Error::UnsupportedPatchAlgo(id) => write!(f, "unsupported patch_algo_id {id}"), + Error::DeltaTooDeep => write!(f, "delta chain too deep"), + Error::Vcdiff(m) => write!(f, "vcdiff error: {m}"), + Error::UnsupportedCompressionAlgo(id) => { + write!(f, "unsupported compression_algo_id {id}") + } + Error::Compression(m) => write!(f, "compression error: {m}"), + Error::NotFound => write!(f, "path not found"), + Error::NotADirectory => write!(f, "not a directory"), + Error::AlreadyExists => write!(f, "already exists"), + Error::InvalidPath(m) => write!(f, "invalid path: {m}"), + } + } +} + +impl std::error::Error for Error {} + +impl From for Error { + fn from(e: std::io::Error) -> Self { + Error::Io(e) + } +} + +impl From for Error { + fn from(e: pcf::Error) -> Self { + Error::Pcf(e) + } +} diff --git a/reference/PFS-MS-v1.0/src/fs.rs b/reference/PFS-MS-v1.0/src/fs.rs new file mode 100644 index 0000000..52db16c --- /dev/null +++ b/reference/PFS-MS-v1.0/src/fs.rs @@ -0,0 +1,89 @@ +//! The high-level [`FsReader`]: open a PFS-MS file and query the reconstructed +//! 
filesystem (Sections 11, 13). + +use std::io::{Read, Seek, Write}; + +use pcf::Container; + +use crate::error::{Error, Result}; +use crate::reader::{build_node_view, scan, verify_chain, NodeView, Scan}; +use crate::session::SessionRecord; +use crate::tree::{build_tree, read_file, resolve_path, Tree}; + +/// A read-only view over a PFS-MS file, backed by a PCF [`Container`]. +pub struct FsReader { + container: Container, +} + +impl FsReader { + /// Open a PFS-MS file (validates the PCF header, spec R1). + pub fn open(storage: S) -> Result { + Ok(Self { + container: Container::open(storage)?, + }) + } + + /// Consume the reader and return the backing store. + pub fn into_storage(self) -> S { + self.container.into_storage() + } + + /// Scan the backward-linked session chain (spec R2, R3). + pub fn scan(&mut self) -> Result { + scan(&mut self.container) + } + + /// Full integrity check: PCF table/data hashes (R1, R7), the inter-session + /// hash chain (R8), and node-view consistency including cycle and + /// duplicate-node detection (R5, R6). + pub fn verify(&mut self) -> Result<()> { + self.container.verify()?; + let scan = scan(&mut self.container)?; + verify_chain(&scan)?; + let view = build_node_view(&scan, None); + build_tree(&view)?; + Ok(()) + } + + /// The resolved node view at the head (or "as of" `max_seq`). + pub fn node_view(&mut self, max_seq: Option) -> Result { + let scan = scan(&mut self.container)?; + Ok(build_node_view(&scan, max_seq)) + } + + /// The live directory tree at the head. + pub fn tree(&mut self) -> Result { + self.tree_as_of(None) + } + + /// The live directory tree as of `max_seq` (history query, Section 15). + pub fn tree_as_of(&mut self, max_seq: Option) -> Result { + let scan = scan(&mut self.container)?; + let view = build_node_view(&scan, max_seq); + build_tree(&view) + } + + /// Read a file's content at the head. 
+ pub fn read_path(&mut self, path: &str) -> Result> { + self.read_path_as_of(path, None) + } + + /// Read a file's content as of `max_seq` (history query, Section 15). + pub fn read_path_as_of(&mut self, path: &str, max_seq: Option) -> Result> { + let scan = scan(&mut self.container)?; + let view = build_node_view(&scan, max_seq); + let tree = build_tree(&view)?; + let id = resolve_path(&tree, path)?; + let rec = tree.nodes.get(&id).ok_or(Error::NotFound)?; + if !rec.is_file() { + return Err(Error::NotADirectory); + } + read_file(&mut self.container, &scan, &view, id) + } + + /// All session records, newest first. + pub fn list_sessions(&mut self) -> Result> { + let scan = scan(&mut self.container)?; + Ok(scan.sessions.into_iter().map(|s| s.record).collect()) + } +} diff --git a/reference/PFS-MS-v1.0/src/lib.rs b/reference/PFS-MS-v1.0/src/lib.rs new file mode 100644 index 0000000..83eeada --- /dev/null +++ b/reference/PFS-MS-v1.0/src/lib.rs @@ -0,0 +1,65 @@ +//! # `pfs-ms` — PFS-MS v1.0 reference implementation +//! +//! PFS-MS (PCF File System, Multi-Session Profile) stores an append-only, +//! multi-session tree of files and directories inside a single **PCF v1.0** +//! file. It is layered *strictly above* PCF: a PFS-MS file is a fully +//! conforming PCF file (a generic PCF reader sees a valid flat set of +//! partitions), and this crate builds entirely on the [`pcf`] reference crate. +//! +//! Three kinds of PCF partition carry the profile: +//! +//! * **RAW** (`0xFFFFFFFF`) — file content: full bytes or a VCDIFF patch. +//! * **PFS_NODE** (`0xAAAA0001`) — one [`NodeRecord`] per changed node. +//! * **PFS_SESSION** (`0xAAAA0002`) — one [`SessionRecord`] per session. +//! +//! Sessions are committed by appending **backward-linked** Table Blocks +//! (newest → oldest via `next_table_offset`) and atomically rewriting the +//! 8-byte header pointer — the sole in-place mutation (Section 4.3). +//! +//! ## Example +//! +//! ``` +//! use std::io::Cursor; +//! 
use pcf::HashAlgo; +//! use pfs_ms::{FsReader, FsWriter}; +//! +//! // Create a filesystem and commit three sessions. +//! let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); +//! w.mkdir("docs").unwrap(); +//! w.put_file("docs/hello.txt", b"Hello\n").unwrap(); +//! w.put_file("docs/hello.txt", b"Hello, world\n").unwrap(); +//! let bytes = w.into_storage().into_inner(); +//! +//! // Read it back. +//! let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); +//! r.verify().unwrap(); +//! assert_eq!(r.read_path("docs/hello.txt").unwrap(), b"Hello, world\n"); +//! ``` + +mod compress; +pub mod consts; +mod delta; +mod dirsync; +mod error; +mod fs; +mod node; +mod reader; +mod session; +mod tree; +mod vector; +mod writer; + +pub use compress::{compress_deflate, decompress}; +pub use consts::*; +pub use dirsync::{create_archive, extract_archive, session_at_time, update_archive, SyncOptions}; +pub use error::{Error, Result}; +pub use fs::FsReader; +pub use node::{ContentSection, NodeRecord}; +pub use reader::{build_node_view, scan, verify_chain, NodeView, Scan, SessionView}; +pub use session::{member_blocks_digest, SessionRecord}; +pub use tree::{build_tree, current_delta_depth, is_live, read_file, resolve_path, Tree}; +pub use vector::build_reference_vector; +pub use writer::{new_id, Change, FsWriter, Partition}; + +// Re-export the underlying hash registry for convenience. +pub use pcf::HashAlgo; diff --git a/reference/PFS-MS-v1.0/src/node.rs b/reference/PFS-MS-v1.0/src/node.rs new file mode 100644 index 0000000..30ad0ab --- /dev/null +++ b/reference/PFS-MS-v1.0/src/node.rs @@ -0,0 +1,431 @@ +//! The Node Record stored as the data of a `PFS_NODE` partition (Section 7). +//! +//! A record is a fixed 54-byte prefix, a variable-length UTF-8 name, and — for +//! live files only — a content section (Section 7.3). The byte layout mirrors +//! Appendix A exactly. 
+ +use pcf::HashAlgo; + +use crate::consts::*; +use crate::error::{Error, Result}; + +/// The content section of a live file's Node Record (Section 7.3). +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ContentSection { + /// `content_kind = 0`: the empty byte string. + Empty, + /// `content_kind = 1`: full bytes in one RAW partition. + Direct { + /// Compression of the stored RAW bytes (0 = none, 1 = DEFLATE). + compression_algo: u8, + /// PCF uid of the RAW partition holding the (possibly compressed) content. + content_uid: [u8; 16], + /// Length of the reconstructed (decompressed) content. + full_size: u64, + /// Hash algorithm of `full_hash`. + full_hash_algo: HashAlgo, + /// Hash of the full (decompressed) content. + full_hash: [u8; HASH_FIELD_SIZE], + }, + /// `content_kind = 2`: a patch against the previous content-bearing version. + Delta { + /// Patch algorithm (1 = VCDIFF). + patch_algo: u8, + /// Compression of the stored RAW patch bytes (0 = none, 1 = DEFLATE). + compression_algo: u8, + /// PCF uid of the RAW partition holding the (possibly compressed) patch. + patch_uid: [u8; 16], + /// Length of the reconstructed content. + full_size: u64, + /// Hash algorithm of `full_hash`. + full_hash_algo: HashAlgo, + /// Hash of the reconstructed content. + full_hash: [u8; HASH_FIELD_SIZE], + /// Length of the base (previous version). + base_full_size: u64, + /// Hash algorithm of `base_full_hash`. + base_full_hash_algo: HashAlgo, + /// Hash of the base. + base_full_hash: [u8; HASH_FIELD_SIZE], + }, + /// `content_kind = 3`: identical bytes to the previous version. + Inherit, +} + +/// A parsed Node Record. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct NodeRecord { + /// `1` = file, `2` = directory. + pub kind: u8, + /// Node flags (bit 0 = TOMBSTONE). + pub flags: u16, + /// Stable 16-byte node identity (all-zero only for the root). + pub node_id: [u8; 16], + /// node_id of the containing directory (equals node_id for the root). 
+ pub parent_id: [u8; 16], + /// Optional modification time (0 = unspecified). + pub mtime_unix_ms: u64, + /// Optional POSIX permission bits (0 = unset). + pub mode: u32, + /// The node's UTF-8 name within its parent (empty for the root). + pub name: Vec, + /// Content section, present iff `kind == file` and not tombstoned. + pub content: Option, +} + +fn rd_u16(b: &[u8]) -> u16 { + u16::from_le_bytes([b[0], b[1]]) +} +fn rd_u32(b: &[u8]) -> u32 { + u32::from_le_bytes([b[0], b[1], b[2], b[3]]) +} +fn rd_u64(b: &[u8]) -> u64 { + u64::from_le_bytes(b[0..8].try_into().unwrap()) +} + +impl NodeRecord { + /// True if the TOMBSTONE flag is set. + pub fn is_tombstone(&self) -> bool { + self.flags & FLAG_TOMBSTONE != 0 + } + /// True if this record describes a file. + pub fn is_file(&self) -> bool { + self.kind == KIND_FILE + } + /// True if this record describes a directory. + pub fn is_dir(&self) -> bool { + self.kind == KIND_DIR + } + /// The name as a UTF-8 string (lossless; names are validated on parse). + pub fn name_str(&self) -> String { + String::from_utf8_lossy(&self.name).into_owned() + } + + /// Validate a name per Section 7.2 (no NUL or '/', not "." or ".."). + fn validate_name(name: &[u8]) -> Result<()> { + if name.len() > PFS_MAX_NAME { + return Err(Error::MalformedNode("name_len out of range")); + } + if name.contains(&0x00) || name.contains(&b'/') { + return Err(Error::MalformedNode("name contains NUL or '/'")); + } + if name == b"." || name == b".." { + return Err(Error::MalformedNode("name is '.' or '..'")); + } + Ok(()) + } + + /// Serialise to the on-disk Node Record layout (Section 7, Appendix A). 
+    pub fn to_bytes(&self) -> Vec<u8> {
+        // Size of the optional content section, so the buffer is allocated
+        // once instead of growing when a DIRECT/DELTA section is appended.
+        let content_len = match &self.content {
+            None => 0,
+            Some(ContentSection::Empty) | Some(ContentSection::Inherit) => 1,
+            Some(ContentSection::Direct { .. }) => DIRECT_SECTION_LEN,
+            Some(ContentSection::Delta { .. }) => DELTA_SECTION_LEN,
+        };
+        let mut b = Vec::with_capacity(NODE_PREFIX_LEN + self.name.len() + content_len);
+
+        // Fixed prefix, in on-disk field order.
+        b.extend_from_slice(&NODE_MAGIC);
+        b.push(NODE_RECORD_VERSION);
+        b.push(self.kind);
+        b.extend_from_slice(&self.flags.to_le_bytes());
+        b.extend_from_slice(&self.node_id);
+        b.extend_from_slice(&self.parent_id);
+        b.extend_from_slice(&self.mtime_unix_ms.to_le_bytes());
+        b.extend_from_slice(&self.mode.to_le_bytes());
+        // NOTE(review): `as u16` silently truncates the length field if
+        // `name.len()` exceeds `u16::MAX`; this function does not validate the
+        // name, so callers must keep it within `PFS_MAX_NAME`.
+        b.extend_from_slice(&(self.name.len() as u16).to_le_bytes());
+        b.extend_from_slice(&self.name);
+        debug_assert_eq!(b.len(), NODE_PREFIX_LEN + self.name.len());
+
+        if let Some(c) = &self.content {
+            match c {
+                ContentSection::Empty => b.push(CONTENT_EMPTY),
+                ContentSection::Inherit => b.push(CONTENT_INHERIT),
+                ContentSection::Direct {
+                    compression_algo,
+                    content_uid,
+                    full_size,
+                    full_hash_algo,
+                    full_hash,
+                } => {
+                    b.push(CONTENT_DIRECT);
+                    b.push(*compression_algo);
+                    b.extend_from_slice(content_uid);
+                    b.extend_from_slice(&full_size.to_le_bytes());
+                    b.push(full_hash_algo.id());
+                    b.extend_from_slice(full_hash);
+                }
+                ContentSection::Delta {
+                    patch_algo,
+                    compression_algo,
+                    patch_uid,
+                    full_size,
+                    full_hash_algo,
+                    full_hash,
+                    base_full_size,
+                    base_full_hash_algo,
+                    base_full_hash,
+                } => {
+                    b.push(CONTENT_DELTA);
+                    b.push(*patch_algo);
+                    b.push(*compression_algo);
+                    b.extend_from_slice(patch_uid);
+                    b.extend_from_slice(&full_size.to_le_bytes());
+                    b.push(full_hash_algo.id());
+                    b.extend_from_slice(full_hash);
+                    b.extend_from_slice(&base_full_size.to_le_bytes());
+                    b.push(base_full_hash_algo.id());
+                    b.extend_from_slice(base_full_hash);
+                }
+            }
+        }
+        debug_assert_eq!(b.len(), NODE_PREFIX_LEN + self.name.len() + content_len);
+        b
+    }
+
+    /// Parse and validate a Node Record from a partition's data (spec R4).
+    pub fn from_bytes(b: &[u8]) -> Result<Self> {
+        if b.len() < NODE_PREFIX_LEN {
+            return Err(Error::MalformedNode("record shorter than fixed prefix"));
+        }
+        if b[0..4] != NODE_MAGIC {
+            return Err(Error::MalformedNode("bad record_magic"));
+        }
+        if b[4] != NODE_RECORD_VERSION {
+            return Err(Error::MalformedNode("unsupported record_version"));
+        }
+        let kind = b[5];
+        if kind != KIND_FILE && kind != KIND_DIR {
+            return Err(Error::MalformedNode("unknown kind"));
+        }
+        let flags = rd_u16(&b[6..8]);
+        if flags & !FLAG_DEFINED_MASK != 0 {
+            return Err(Error::MalformedNode("reserved flag bit set"));
+        }
+        let mut node_id = [0u8; 16];
+        node_id.copy_from_slice(&b[8..24]);
+        let mut parent_id = [0u8; 16];
+        parent_id.copy_from_slice(&b[24..40]);
+        let mtime_unix_ms = rd_u64(&b[40..48]);
+        let mode = rd_u32(&b[48..52]);
+        let name_len = rd_u16(&b[52..54]) as usize;
+        // Checked before slicing so an oversized length is reported as such
+        // rather than as a truncation.
+        if name_len > PFS_MAX_NAME {
+            return Err(Error::MalformedNode("name_len out of range"));
+        }
+        let name_end = NODE_PREFIX_LEN + name_len;
+        if b.len() < name_end {
+            return Err(Error::MalformedNode("record truncated within name"));
+        }
+        let name = b[NODE_PREFIX_LEN..name_end].to_vec();
+        Self::validate_name(&name)?;
+
+        let tombstone = flags & FLAG_TOMBSTONE != 0;
+        let has_content = kind == KIND_FILE && !tombstone;
+        let rest = &b[name_end..];
+
+        let content = if has_content {
+            Some(Self::parse_content(rest)?)
+        } else {
+            // Directories and tombstones end after the name.
+            if !rest.is_empty() {
+                return Err(Error::MalformedNode("unexpected trailing bytes"));
+            }
+            None
+        };
+
+        Ok(NodeRecord {
+            kind,
+            flags,
+            node_id,
+            parent_id,
+            mtime_unix_ms,
+            mode,
+            name,
+            content,
+        })
+    }
+
+    /// Parse the content section that follows the name of a live file record.
+    ///
+    /// `rest` must be exactly one section: every variant is length-checked
+    /// before any field is sliced, so the fixed offsets below cannot go out of
+    /// bounds.
+    fn parse_content(rest: &[u8]) -> Result<ContentSection> {
+        if rest.is_empty() {
+            return Err(Error::MalformedNode("missing content section"));
+        }
+        let kind = rest[0];
+        match kind {
+            CONTENT_EMPTY => {
+                if rest.len() != 1 {
+                    return Err(Error::MalformedNode("EMPTY section has trailing bytes"));
+                }
+                Ok(ContentSection::Empty)
+            }
+            CONTENT_INHERIT => {
+                if rest.len() != 1 {
+                    return Err(Error::MalformedNode("INHERIT section has trailing bytes"));
+                }
+                Ok(ContentSection::Inherit)
+            }
+            CONTENT_DIRECT => {
+                if rest.len() != DIRECT_SECTION_LEN {
+                    return Err(Error::MalformedNode("DIRECT section wrong length"));
+                }
+                let compression_algo = rest[1];
+                let mut content_uid = [0u8; 16];
+                content_uid.copy_from_slice(&rest[2..18]);
+                let full_size = rd_u64(&rest[18..26]);
+                let full_hash_algo = HashAlgo::from_id(rest[26])?;
+                let mut full_hash = [0u8; HASH_FIELD_SIZE];
+                full_hash.copy_from_slice(&rest[27..91]);
+                Ok(ContentSection::Direct {
+                    compression_algo,
+                    content_uid,
+                    full_size,
+                    full_hash_algo,
+                    full_hash,
+                })
+            }
+            CONTENT_DELTA => {
+                if rest.len() != DELTA_SECTION_LEN {
+                    return Err(Error::MalformedNode("DELTA section wrong length"));
+                }
+                let patch_algo = rest[1];
+                let compression_algo = rest[2];
+                let mut patch_uid = [0u8; 16];
+                patch_uid.copy_from_slice(&rest[3..19]);
+                let full_size = rd_u64(&rest[19..27]);
+                let full_hash_algo = HashAlgo::from_id(rest[27])?;
+                let mut full_hash = [0u8; HASH_FIELD_SIZE];
+                full_hash.copy_from_slice(&rest[28..92]);
+                let base_full_size = rd_u64(&rest[92..100]);
+                let base_full_hash_algo = HashAlgo::from_id(rest[100])?;
+                let mut base_full_hash = [0u8; HASH_FIELD_SIZE];
+                base_full_hash.copy_from_slice(&rest[101..165]);
+                Ok(ContentSection::Delta {
+                    patch_algo,
+                    compression_algo,
+                    patch_uid,
+                    full_size,
+                    full_hash_algo,
+                    full_hash,
+                    base_full_size,
+                    base_full_hash_algo,
+                    base_full_hash,
+                })
+            }
+            _ => Err(Error::MalformedNode("unknown content_kind")),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn h(algo: HashAlgo, data: &[u8]) -> [u8; HASH_FIELD_SIZE] {
+        algo.compute(data)
+    }
+
+    #[test]
+    fn dir_roundtrip() {
+        let r = NodeRecord {
+            kind: KIND_DIR,
+            flags: 0,
+            node_id: [7u8; 16],
+            parent_id: ROOT_NODE_ID,
+            mtime_unix_ms: 123,
+            mode: 0o755,
+            name: b"docs".to_vec(),
+            content: None,
+        };
+        let bytes = r.to_bytes();
+        assert_eq!(bytes.len(), NODE_PREFIX_LEN + 4);
+        assert_eq!(NodeRecord::from_bytes(&bytes).unwrap(), r);
+    }
+
+    #[test]
+    fn direct_file_roundtrip() {
+        let r = NodeRecord {
+            kind: KIND_FILE,
+            flags: 0,
+            node_id: [9u8; 16],
+            parent_id: [7u8; 16],
+            mtime_unix_ms: 0,
+            mode: 0,
+            name: b"hello.txt".to_vec(),
+            content: Some(ContentSection::Direct {
+                compression_algo: COMPRESS_DEFLATE,
+                content_uid: [3u8; 16],
+                full_size: 6,
+                full_hash_algo: HashAlgo::Sha256,
+                full_hash: h(HashAlgo::Sha256, b"Hello\n"),
+            }),
+        };
+        let bytes = r.to_bytes();
+        assert_eq!(bytes.len(), NODE_PREFIX_LEN + 9 + DIRECT_SECTION_LEN);
+        assert_eq!(NodeRecord::from_bytes(&bytes).unwrap(), r);
+    }
+
+    #[test]
+    fn delta_file_roundtrip() {
+        let r = NodeRecord {
+            kind: KIND_FILE,
+            flags: 0,
+            node_id: [9u8; 16],
+            parent_id: [7u8; 16],
+            mtime_unix_ms: 0,
+            mode: 0,
+            name: b"hello.txt".to_vec(),
+            content: Some(ContentSection::Delta {
+                patch_algo: PATCH_VCDIFF,
+                compression_algo: COMPRESS_NONE,
+                patch_uid: [4u8; 16],
+                full_size: 13,
+                full_hash_algo: HashAlgo::Sha256,
+                full_hash: h(HashAlgo::Sha256, b"Hello, world\n"),
+                base_full_size: 6,
+                base_full_hash_algo: HashAlgo::Sha256,
+                base_full_hash: h(HashAlgo::Sha256, b"Hello\n"),
+            }),
+        };
+        let bytes = r.to_bytes();
+        assert_eq!(bytes.len(), NODE_PREFIX_LEN + 9 + DELTA_SECTION_LEN);
+        assert_eq!(NodeRecord::from_bytes(&bytes).unwrap(), r);
+    }
+
+    #[test]
+    fn tombstone_has_no_content() {
+        let r = NodeRecord {
+            kind: KIND_FILE,
+            flags: FLAG_TOMBSTONE,
+            node_id: [9u8; 16],
+            parent_id: [7u8; 16],
+            mtime_unix_ms: 0,
+            mode: 0,
+            name: b"gone.txt".to_vec(),
+            content: None,
+        };
+        let bytes = r.to_bytes();
+        let back = NodeRecord::from_bytes(&bytes).unwrap();
+        assert!(back.is_tombstone());
+        assert!(back.content.is_none());
+    }
+
+    #[test]
+    fn rejects_bad_name_and_flags() {
+        let base = NodeRecord {
+            kind: KIND_DIR,
+            flags: 0,
+            node_id: [1u8; 16],
+            parent_id: ROOT_NODE_ID,
+            mtime_unix_ms: 0,
+            mode: 0,
+            name: b"ok".to_vec(),
+            content: None,
+        };
+        let mut slash = base.clone();
+        slash.name = b"a/b".to_vec();
+        assert!(NodeRecord::from_bytes(&slash.to_bytes()).is_err());
+
+        let mut dotdot = base.clone();
+        dotdot.name = b"..".to_vec();
+        assert!(NodeRecord::from_bytes(&dotdot.to_bytes()).is_err());
+
+        // A reserved flag bit must be rejected on parse.
+        let mut bytes = base.to_bytes();
+        bytes[6] = 0x02; // set a reserved flag bit
+        assert!(matches!(
+            NodeRecord::from_bytes(&bytes),
+            Err(Error::MalformedNode(_))
+        ));
+    }
+}
diff --git a/reference/PFS-MS-v1.0/src/reader.rs b/reference/PFS-MS-v1.0/src/reader.rs
new file mode 100644
index 0000000..b025257
--- /dev/null
+++ b/reference/PFS-MS-v1.0/src/reader.rs
@@ -0,0 +1,215 @@
+//! Reading a PFS-MS file: walk the backward-linked session chain, group blocks
+//! into sessions, verify the inter-session hash chain, and build the node view
+//! (Sections 8, 10, 11).
+//!
+//! The core is a set of generic functions over a [`pcf::Container`], so both the
+//! owning [`FsReader`] and the writer's mid-commit state snapshot can share
+//! them. Block walking reuses [`pcf::Container::read_block_at`].
+
+use std::collections::HashMap;
+use std::io::{Read, Seek, Write};
+
+use pcf::{Container, HashAlgo, PartitionEntry};
+
+use crate::consts::*;
+use crate::error::{Error, Result};
+use crate::node::NodeRecord;
+use crate::session::{member_blocks_digest, SessionRecord};
+
+/// One session as recovered from the chain (newest sessions appear first).
+#[derive(Debug, Clone)]
+pub struct SessionView {
+    /// `session_seq` of this session.
+    pub seq: u64,
+    /// Absolute offset of this session's HEAD block.
+    pub head_offset: u64,
+    /// The parsed Session Record.
+    pub record: SessionRecord,
+    /// `(offset, table_hash, algo)` for each block, HEAD first then members.
+    pub block_hashes: Vec<(u64, [u8; HASH_FIELD_SIZE], HashAlgo)>,
+    /// MEMBER block table_hashes in chain order (HEAD excluded).
+    pub member_hashes: Vec<[u8; HASH_FIELD_SIZE]>,
+    /// Every PFS_NODE record introduced by this session.
+    pub nodes: Vec<NodeRecord>,
+}
+
+/// The result of scanning the whole chain.
+#[derive(Debug, Clone, Default)]
+pub struct Scan {
+    /// Sessions, newest first (strictly decreasing `seq`).
+    pub sessions: Vec<SessionView>,
+    /// PCF uid -> entry, for content lookup during reconstruction.
+    pub uid_index: HashMap<[u8; 16], PartitionEntry>,
+}
+
+/// True iff the significant prefix of two hash fields matches for `algo`.
+///
+/// Only the first `algo.digest_len()` bytes are compared; the remaining bytes
+/// of the fixed-width fields are ignored.
+pub fn hash_eq(algo: HashAlgo, a: &[u8; HASH_FIELD_SIZE], b: &[u8; HASH_FIELD_SIZE]) -> bool {
+    // NOTE(review): if `digest_len()` is 0 for an algorithm (presumably
+    // `HashAlgo::None`), this comparison is vacuously true — callers that need
+    // "all zero" must check the full field themselves.
+    let n = algo.digest_len();
+    a[..n] == b[..n]
+}
+
+/// Walk the backward-linked chain from the head, grouping blocks into sessions
+/// (Section 11.2, spec R2/R3).
+pub fn scan(c: &mut Container) -> Result { + let mut sessions = Vec::new(); + let mut uid_index: HashMap<[u8; 16], PartitionEntry> = HashMap::new(); + let mut last_seq: Option = None; + + let mut tbl = c.header().partition_table_offset; + while tbl != 0 { + let head = c.read_block_at(tbl)?; + let session_entries: Vec = head + .entries + .iter() + .filter(|e| e.partition_type == PFS_SESSION_TYPE) + .cloned() + .collect(); + if session_entries.len() != 1 { + return Err(Error::BrokenChain( + "HEAD block must hold exactly one PFS_SESSION", + )); + } + let sess_data = c.read_partition_data(&session_entries[0])?; + let record = SessionRecord::from_bytes(&sess_data)?; + + if let Some(prev) = last_seq { + if record.session_seq >= prev { + return Err(Error::BrokenChain("session_seq not strictly decreasing")); + } + } + last_seq = Some(record.session_seq); + + let mut all_entries = head.entries.clone(); + let mut block_hashes = vec![( + head.offset, + head.header.table_hash, + head.header.table_hash_algo, + )]; + let mut member_hashes: Vec<[u8; HASH_FIELD_SIZE]> = Vec::new(); + + let mut t = head.header.next_table_offset; + for _ in 1..record.block_count { + if t == 0 { + return Err(Error::BrokenChain( + "chain ended before block_count blocks were read", + )); + } + let mv = c.read_block_at(t)?; + if mv + .entries + .iter() + .any(|e| e.partition_type == PFS_SESSION_TYPE) + { + return Err(Error::BrokenChain("MEMBER block contains a PFS_SESSION")); + } + member_hashes.push(mv.header.table_hash); + block_hashes.push((mv.offset, mv.header.table_hash, mv.header.table_hash_algo)); + all_entries.extend(mv.entries.iter().cloned()); + t = mv.header.next_table_offset; + } + + // Index uids and parse node records; reject a node_id seen twice here. 
+ let mut seen: HashMap<[u8; 16], ()> = HashMap::new(); + let mut nodes = Vec::new(); + for e in &all_entries { + uid_index.insert(e.uid, e.clone()); + if e.partition_type == PFS_NODE_TYPE { + let data = c.read_partition_data(e)?; + let rec = NodeRecord::from_bytes(&data)?; + if seen.insert(rec.node_id, ()).is_some() { + return Err(Error::DuplicateNodeInSession); + } + nodes.push(rec); + } + } + + sessions.push(SessionView { + seq: record.session_seq, + head_offset: head.offset, + record, + block_hashes, + member_hashes, + nodes, + }); + tbl = t; + } + + Ok(Scan { + sessions, + uid_index, + }) +} + +/// Verify the inter-session hash chain (Section 8.2, spec R8). Assumes the +/// container's own table/data hashes have already been verified via +/// [`pcf::Container::verify`]. +pub fn verify_chain(scan: &Scan) -> Result<()> { + for (i, s) in scan.sessions.iter().enumerate() { + // Member-block commitment. + let digest = member_blocks_digest(s.record.member_digest_algo, &s.member_hashes); + if !hash_eq( + s.record.member_digest_algo, + &digest, + &s.record.member_blocks_digest, + ) { + return Err(Error::ChainHashMismatch); + } + // Inter-session commitment: this session's prev_session_hash must equal + // the previous (older) session's HEAD block table_hash. + match scan.sessions.get(i + 1) { + Some(prev) => { + let (_, prev_head_hash, prev_head_algo) = prev.block_hashes[0]; + if s.record.prev_session_hash_algo != prev_head_algo + || !hash_eq(prev_head_algo, &s.record.prev_session_hash, &prev_head_hash) + { + return Err(Error::ChainHashMismatch); + } + } + None => { + // Oldest session: prev hash must be zero under algo None. + if s.record.prev_session_hash_algo != HashAlgo::None + || s.record.prev_session_hash != [0u8; HASH_FIELD_SIZE] + { + return Err(Error::ChainHashMismatch); + } + } + } + } + Ok(()) +} + +/// The resolved per-node state (Section 10.2). +#[derive(Debug, Clone, Default)] +pub struct NodeView { + /// node_id -> (winning session_seq, current record). 
Newest wins. + pub current: HashMap<[u8; 16], (u64, NodeRecord)>, + /// node_id -> records, descending by session_seq (for reconstruction). + pub history: HashMap<[u8; 16], Vec<(u64, NodeRecord)>>, +} + +/// Build the node view from a scan, optionally "as of" `max_seq` (inclusive), +/// implementing the history-query facility of Section 15. +pub fn build_node_view(scan: &Scan, max_seq: Option) -> NodeView { + let mut view = NodeView::default(); + // Ascending session_seq so "newest wins" falls out naturally. + let mut ordered: Vec<&SessionView> = scan + .sessions + .iter() + .filter(|s| max_seq.map(|m| s.seq <= m).unwrap_or(true)) + .collect(); + ordered.sort_by_key(|s| s.seq); + + for s in ordered { + for rec in &s.nodes { + view.history + .entry(rec.node_id) + .or_default() + .push((s.seq, rec.clone())); + view.current.insert(rec.node_id, (s.seq, rec.clone())); + } + } + for v in view.history.values_mut() { + v.sort_by_key(|b| std::cmp::Reverse(b.0)); // descending seq + } + view +} diff --git a/reference/PFS-MS-v1.0/src/session.rs b/reference/PFS-MS-v1.0/src/session.rs new file mode 100644 index 0000000..5d25b6d --- /dev/null +++ b/reference/PFS-MS-v1.0/src/session.rs @@ -0,0 +1,203 @@ +//! The Session Record stored as the data of the single `PFS_SESSION` partition +//! in a session's HEAD block (Section 8, Appendix A). + +use pcf::HashAlgo; + +use crate::consts::*; +use crate::error::{Error, Result}; + +/// A parsed Session Record. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SessionRecord { + /// Profile major version of the writer that produced this session. + pub profile_version_major: u8, + /// Profile minor version of the writer that produced this session. + pub profile_version_minor: u8, + /// 1-based, strictly increasing session number. + pub session_seq: u64, + /// Optional commit timestamp (0 = unspecified). + pub timestamp_unix_ms: u64, + /// Algorithm of `prev_session_hash` (0 for the first session). 
+ pub prev_session_hash_algo: HashAlgo, + /// Previous session HEAD block's table_hash (zero for the first session). + pub prev_session_hash: [u8; HASH_FIELD_SIZE], + /// Number of Table Blocks in this session (>= 1). + pub block_count: u32, + /// Algorithm of `member_blocks_digest` (0 when block_count == 1). + pub member_digest_algo: HashAlgo, + /// Digest over this session's MEMBER block table_hashes (zero if none). + pub member_blocks_digest: [u8; HASH_FIELD_SIZE], + /// Number of PFS_NODE records in this session (informational). + pub change_count: u16, + /// Optional free-form writer identifier (UTF-8). + pub writer: Vec, +} + +fn rd_u16(b: &[u8]) -> u16 { + u16::from_le_bytes([b[0], b[1]]) +} +fn rd_u32(b: &[u8]) -> u32 { + u32::from_le_bytes([b[0], b[1], b[2], b[3]]) +} +fn rd_u64(b: &[u8]) -> u64 { + u64::from_le_bytes(b[0..8].try_into().unwrap()) +} + +impl SessionRecord { + /// Serialise to the on-disk layout (length `162 + writer_len`). + pub fn to_bytes(&self) -> Vec { + let mut b = Vec::with_capacity(SESSION_PREFIX_LEN + self.writer.len()); + b.extend_from_slice(&SESSION_MAGIC); + b.push(self.profile_version_major); + b.push(self.profile_version_minor); + b.extend_from_slice(&0u16.to_le_bytes()); // reserved + b.extend_from_slice(&self.session_seq.to_le_bytes()); + b.extend_from_slice(&self.timestamp_unix_ms.to_le_bytes()); + b.push(self.prev_session_hash_algo.id()); + b.extend_from_slice(&self.prev_session_hash); + b.extend_from_slice(&self.block_count.to_le_bytes()); + b.push(self.member_digest_algo.id()); + b.extend_from_slice(&self.member_blocks_digest); + b.extend_from_slice(&self.change_count.to_le_bytes()); + b.extend_from_slice(&(self.writer.len() as u16).to_le_bytes()); + b.extend_from_slice(&self.writer); + debug_assert_eq!(b.len(), SESSION_PREFIX_LEN + self.writer.len()); + b + } + + /// Parse and validate a Session Record (spec R3). 
+    ///
+    /// # Errors
+    /// [`Error::MalformedSession`] when the record is shorter than the fixed
+    /// prefix, the magic or major version is wrong, `block_count` is zero, or
+    /// `writer_len` disagrees with the record length. Unknown hash-algorithm
+    /// ids are reported by [`HashAlgo::from_id`].
+    pub fn from_bytes(b: &[u8]) -> Result<Self> {
+        if b.len() < SESSION_PREFIX_LEN {
+            return Err(Error::MalformedSession("record shorter than fixed prefix"));
+        }
+        if b[0..4] != SESSION_MAGIC {
+            return Err(Error::MalformedSession("bad record_magic"));
+        }
+        let profile_version_major = b[4];
+        if profile_version_major != PROFILE_VERSION_MAJOR {
+            return Err(Error::MalformedSession("unsupported profile major version"));
+        }
+        let profile_version_minor = b[5];
+        // b[6..8] is the reserved field; it is not inspected.
+        let session_seq = rd_u64(&b[8..16]);
+        let timestamp_unix_ms = rd_u64(&b[16..24]);
+        let prev_session_hash_algo = HashAlgo::from_id(b[24])?;
+        let mut prev_session_hash = [0u8; HASH_FIELD_SIZE];
+        prev_session_hash.copy_from_slice(&b[25..89]);
+        let block_count = rd_u32(&b[89..93]);
+        if block_count < 1 {
+            return Err(Error::MalformedSession("block_count must be >= 1"));
+        }
+        let member_digest_algo = HashAlgo::from_id(b[93])?;
+        let mut member_blocks_digest = [0u8; HASH_FIELD_SIZE];
+        member_blocks_digest.copy_from_slice(&b[94..158]);
+        let change_count = rd_u16(&b[158..160]);
+        let writer_len = rd_u16(&b[160..162]) as usize;
+        if b.len() != SESSION_PREFIX_LEN + writer_len {
+            return Err(Error::MalformedSession(
+                "writer_len does not match record length",
+            ));
+        }
+        // The exact-length check above guarantees everything past the fixed
+        // prefix is the writer string, so slice by the named constant instead
+        // of repeating its value.
+        let writer = b[SESSION_PREFIX_LEN..].to_vec();
+
+        Ok(SessionRecord {
+            profile_version_major,
+            profile_version_minor,
+            session_seq,
+            timestamp_unix_ms,
+            prev_session_hash_algo,
+            prev_session_hash,
+            block_count,
+            member_digest_algo,
+            member_blocks_digest,
+            change_count,
+            writer,
+        })
+    }
+}
+
+/// Compute `member_blocks_digest = H(member[0].table_hash || member[1] || ...)`
+/// over the stored 64-byte table_hash fields in chain-traversal order
+/// (Section 8.2). With no member blocks the digest is 64 zero bytes under
+/// algorithm `None`.
+pub fn member_blocks_digest( + algo: HashAlgo, + member_table_hashes: &[[u8; HASH_FIELD_SIZE]], +) -> [u8; HASH_FIELD_SIZE] { + if member_table_hashes.is_empty() { + return [0u8; HASH_FIELD_SIZE]; + } + let mut image = Vec::with_capacity(member_table_hashes.len() * HASH_FIELD_SIZE); + for h in member_table_hashes { + image.extend_from_slice(h); + } + algo.compute(&image) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn session_roundtrip_no_members() { + let r = SessionRecord { + profile_version_major: PROFILE_VERSION_MAJOR, + profile_version_minor: PROFILE_VERSION_MINOR, + session_seq: 1, + timestamp_unix_ms: 0, + prev_session_hash_algo: HashAlgo::None, + prev_session_hash: [0u8; HASH_FIELD_SIZE], + block_count: 1, + member_digest_algo: HashAlgo::None, + member_blocks_digest: [0u8; HASH_FIELD_SIZE], + change_count: 3, + writer: b"pfs-ref".to_vec(), + }; + let bytes = r.to_bytes(); + assert_eq!(bytes.len(), SESSION_PREFIX_LEN + 7); + assert_eq!(SessionRecord::from_bytes(&bytes).unwrap(), r); + } + + #[test] + fn session_roundtrip_with_members() { + let m0 = HashAlgo::Sha256.compute(b"m0"); + let m1 = HashAlgo::Sha256.compute(b"m1"); + let digest = member_blocks_digest(HashAlgo::Sha256, &[m0, m1]); + let r = SessionRecord { + profile_version_major: PROFILE_VERSION_MAJOR, + profile_version_minor: PROFILE_VERSION_MINOR, + session_seq: 42, + timestamp_unix_ms: 7, + prev_session_hash_algo: HashAlgo::Sha256, + prev_session_hash: HashAlgo::Sha256.compute(b"prev"), + block_count: 3, + member_digest_algo: HashAlgo::Sha256, + member_blocks_digest: digest, + change_count: 600, + writer: Vec::new(), + }; + assert_eq!(SessionRecord::from_bytes(&r.to_bytes()).unwrap(), r); + } + + #[test] + fn rejects_bad_magic_and_length() { + let mut bytes = SessionRecord { + profile_version_major: PROFILE_VERSION_MAJOR, + profile_version_minor: PROFILE_VERSION_MINOR, + session_seq: 1, + timestamp_unix_ms: 0, + prev_session_hash_algo: HashAlgo::None, + prev_session_hash: [0u8; 
HASH_FIELD_SIZE], + block_count: 1, + member_digest_algo: HashAlgo::None, + member_blocks_digest: [0u8; HASH_FIELD_SIZE], + change_count: 0, + writer: Vec::new(), + } + .to_bytes(); + let good = bytes.clone(); + bytes[0] = 0; + assert!(SessionRecord::from_bytes(&bytes).is_err()); + // Truncated writer region. + assert!(SessionRecord::from_bytes(&good[..good.len() - 1]).is_err()); + } +} diff --git a/reference/PFS-MS-v1.0/src/tree.rs b/reference/PFS-MS-v1.0/src/tree.rs new file mode 100644 index 0000000..a02d08c --- /dev/null +++ b/reference/PFS-MS-v1.0/src/tree.rs @@ -0,0 +1,317 @@ +//! Filesystem semantics over a node view: liveness, the directory tree, path +//! resolution, and file content reconstruction (Sections 9.3, 10). + +use std::collections::{HashMap, HashSet}; +use std::io::{Read, Seek, Write}; + +use pcf::Container; + +use crate::consts::*; +use crate::error::{Error, Result}; +use crate::node::{ContentSection, NodeRecord}; +use crate::reader::{NodeView, Scan}; + +/// A directory's live children keyed by name, each mapped to the winning +/// `(session_seq, node_id)` used to resolve collisions (Section 10.3). +type SiblingNames = HashMap, (u64, [u8; 16])>; + +/// The reconstructed directory tree at a point in history. +#[derive(Debug, Clone, Default)] +pub struct Tree { + /// Live nodes by node_id (the current record for each). + pub nodes: HashMap<[u8; 16], NodeRecord>, + /// Live children (node_ids) of each live directory, name-deduplicated. + pub children: HashMap<[u8; 16], Vec<[u8; 16]>>, +} + +/// Memoised liveness with cycle detection (Section 10.2, spec R6). 
+struct Liveness<'a> { + view: &'a NodeView, + memo: HashMap<[u8; 16], bool>, +} + +impl<'a> Liveness<'a> { + fn new(view: &'a NodeView) -> Self { + Liveness { + view, + memo: HashMap::new(), + } + } + + fn is_live(&mut self, id: [u8; 16]) -> Result { + let mut stack: HashSet<[u8; 16]> = HashSet::new(); + self.walk(id, &mut stack) + } + + fn walk(&mut self, id: [u8; 16], stack: &mut HashSet<[u8; 16]>) -> Result { + if let Some(&v) = self.memo.get(&id) { + return Ok(v); + } + if !stack.insert(id) { + return Err(Error::ParentCycle); + } + + let result: Result = (|| { + if id == ROOT_NODE_ID { + // The root is live unless an explicit record tombstones it; if + // absent it is synthesized as a live empty directory. + return Ok(match self.view.current.get(&ROOT_NODE_ID) { + Some((_, r)) => !r.is_tombstone(), + None => true, + }); + } + let (_, rec) = match self.view.current.get(&id) { + Some(x) => x, + None => return Ok(false), + }; + if rec.is_tombstone() { + return Ok(false); + } + let parent = rec.parent_id; + // A non-root node parenting itself can never reach the root. + if parent == id { + return Ok(false); + } + if !self.walk(parent, stack)? { + return Ok(false); + } + // The parent must be a live directory. + let parent_is_dir = if parent == ROOT_NODE_ID { + true + } else { + self.view + .current + .get(&parent) + .map(|(_, r)| r.is_dir()) + .unwrap_or(false) + }; + Ok(parent_is_dir) + })(); + + stack.remove(&id); + let live = result?; + self.memo.insert(id, live); + Ok(live) + } +} + +/// True iff `id` resolves to a live node in `view`. +pub fn is_live(view: &NodeView, id: [u8; 16]) -> Result { + Liveness::new(view).is_live(id) +} + +/// Build the live directory tree, enforcing unique names among live siblings +/// (Section 10.3): on a collision the greater session_seq wins. +pub fn build_tree(view: &NodeView) -> Result { + let mut live = Liveness::new(view); + let mut tree = Tree::default(); + + // Synthesize the root if no explicit record exists. 
+ let root_rec = match view.current.get(&ROOT_NODE_ID) { + Some((_, r)) => r.clone(), + None => NodeRecord { + kind: KIND_DIR, + flags: 0, + node_id: ROOT_NODE_ID, + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: Vec::new(), + content: None, + }, + }; + if !root_rec.is_tombstone() { + tree.nodes.insert(ROOT_NODE_ID, root_rec); + tree.children.entry(ROOT_NODE_ID).or_default(); + } + + // Collect every live node. + for (&id, (_, rec)) in view.current.iter() { + if id == ROOT_NODE_ID { + continue; + } + if live.is_live(id)? { + tree.nodes.insert(id, rec.clone()); + if rec.is_dir() { + tree.children.entry(id).or_default(); + } + } + } + + // Attach children to parents, resolving name collisions by greater seq. + // parent_id -> (name -> (winning_seq, winning_id)) + let mut by_parent: HashMap<[u8; 16], SiblingNames> = HashMap::new(); + for (&id, rec) in tree.nodes.iter() { + if id == ROOT_NODE_ID { + continue; + } + let seq = view.current.get(&id).map(|(s, _)| *s).unwrap_or(0); + let slot = by_parent.entry(rec.parent_id).or_default(); + match slot.get(&rec.name) { + Some(&(other_seq, _)) if other_seq >= seq => { /* keep existing winner */ } + _ => { + slot.insert(rec.name.clone(), (seq, id)); + } + } + } + for (parent, names) in by_parent { + let entry = tree.children.entry(parent).or_default(); + for (_, (_, id)) in names { + entry.push(id); + } + } + // Stable, name-sorted children for deterministic listings. + for kids in tree.children.values_mut() { + kids.sort_by(|a, b| { + let na = tree + .nodes + .get(a) + .map(|r| r.name.clone()) + .unwrap_or_default(); + let nb = tree + .nodes + .get(b) + .map(|r| r.name.clone()) + .unwrap_or_default(); + na.cmp(&nb) + }); + } + + Ok(tree) +} + +/// Resolve a '/'-separated path to a live node_id. "" or "/" is the root. 
+pub fn resolve_path(tree: &Tree, path: &str) -> Result<[u8; 16]> { + let mut cur = ROOT_NODE_ID; + if !tree.nodes.contains_key(&ROOT_NODE_ID) { + return Err(Error::NotFound); + } + for comp in path.split('/') { + if comp.is_empty() || comp == "." { + continue; + } + let kids = tree.children.get(&cur).ok_or(Error::NotADirectory)?; + let next = kids.iter().find(|id| { + tree.nodes + .get(*id) + .map(|r| r.name == comp.as_bytes()) + .unwrap_or(false) + }); + match next { + Some(&id) => cur = id, + None => return Err(Error::NotFound), + } + } + Ok(cur) +} + +/// Reconstruct the current content of a live file node (Section 9.3). +pub fn read_file( + c: &mut Container, + scan: &Scan, + view: &NodeView, + node_id: [u8; 16], +) -> Result> { + let history = view.history.get(&node_id).ok_or(Error::NotFound)?; + // Content-bearing versions (excludes tombstones/dirs), descending seq. + let chain: Vec<&NodeRecord> = history + .iter() + .filter_map(|(_, r)| if r.content.is_some() { Some(r) } else { None }) + .collect(); + if chain.is_empty() { + return Err(Error::NotFound); + } + materialize(c, scan, &chain, 0, 0) +} + +fn materialize( + c: &mut Container, + scan: &Scan, + chain: &[&NodeRecord], + k: usize, + depth: usize, +) -> Result> { + if depth > MIN_READER_DELTA_DEPTH.max(4096) { + return Err(Error::DeltaTooDeep); + } + let rec = chain.get(k).ok_or(Error::MissingBase)?; + let content = rec.content.as_ref().ok_or(Error::MissingBase)?; + match content { + ContentSection::Empty => Ok(Vec::new()), + ContentSection::Inherit => materialize(c, scan, chain, k + 1, depth + 1), + ContentSection::Direct { + compression_algo, + content_uid, + full_size, + full_hash_algo, + full_hash, + } => { + let entry = scan + .uid_index + .get(content_uid) + .ok_or(Error::MissingContent)? 
+ .clone(); + let stored = c.read_partition_data(&entry)?; + if !entry.data_hash_algo.verify(&stored, &entry.data_hash) { + return Err(Error::ContentHashMismatch); + } + let data = crate::compress::decompress(*compression_algo, &stored)?; + if data.len() as u64 != *full_size || !full_hash_algo.verify(&data, full_hash) { + return Err(Error::ContentHashMismatch); + } + Ok(data) + } + ContentSection::Delta { + patch_algo, + compression_algo, + patch_uid, + full_size, + full_hash_algo, + full_hash, + base_full_size, + base_full_hash_algo, + base_full_hash, + } => { + let base = materialize(c, scan, chain, k + 1, depth + 1)?; + if base.len() as u64 != *base_full_size + || !base_full_hash_algo.verify(&base, base_full_hash) + { + return Err(Error::ContentHashMismatch); + } + let entry = scan + .uid_index + .get(patch_uid) + .ok_or(Error::MissingContent)? + .clone(); + let stored = c.read_partition_data(&entry)?; + if !entry.data_hash_algo.verify(&stored, &entry.data_hash) { + return Err(Error::ContentHashMismatch); + } + let patch = crate::compress::decompress(*compression_algo, &stored)?; + let bytes = crate::delta::apply(*patch_algo, &base, &patch)?; + if bytes.len() as u64 != *full_size || !full_hash_algo.verify(&bytes, full_hash) { + return Err(Error::ContentHashMismatch); + } + Ok(bytes) + } + } +} + +/// The current delta depth of a live file node: the number of consecutive +/// DELTA/INHERIT records before the first EMPTY/DIRECT (Section 9.4). Returns 0 +/// if the node has no content-bearing history. +pub fn current_delta_depth(view: &NodeView, node_id: [u8; 16]) -> usize { + let history = match view.history.get(&node_id) { + Some(h) => h, + None => return 0, + }; + let mut depth = 0; + for (_, r) in history.iter() { + match &r.content { + Some(ContentSection::Delta { .. 
}) | Some(ContentSection::Inherit) => depth += 1, + Some(_) => break, // EMPTY or DIRECT terminates the chain + None => continue, // tombstone/dir: skip + } + } + depth +} diff --git a/reference/PFS-MS-v1.0/src/vector.rs b/reference/PFS-MS-v1.0/src/vector.rs new file mode 100644 index 0000000..aae9029 --- /dev/null +++ b/reference/PFS-MS-v1.0/src/vector.rs @@ -0,0 +1,203 @@ +//! The canonical Section 17 reference vector. +//! +//! [`build_reference_vector`] deterministically constructs the three-session +//! scenario from the specification (Section 17) using fixed uids, fixed +//! node_ids, and a zero timestamp, so independent implementations can pin the +//! exact bytes. It uses the low-level [`FsWriter::commit`] API directly (rather +//! than the uuid/clock-driven high-level operations) precisely so the output is +//! reproducible. +//! +//! For illustration the first session stores a DEFLATE-compressed DIRECT +//! content (compression_algo_id = 1), and the second session is emitted as a +//! DELTA regardless of patch size, so the vector exercises both the compression +//! field and the DELTA content-section layout. + +use std::io::Cursor; + +use pcf::HashAlgo; + +use crate::compress::compress_deflate; +use crate::consts::*; +use crate::delta::diff_vcdiff; +use crate::node::{ContentSection, NodeRecord}; +use crate::writer::{FsWriter, Partition}; +use crate::Result; + +const ALGO: HashAlgo = HashAlgo::Sha256; + +fn id(b: u8) -> [u8; 16] { + [b; 16] +} + +/// hello.txt v1: a compressible payload so the DIRECT content is stored DEFLATE. +pub(crate) fn demo_v1() -> Vec { + b"Hello, PFS-MS! ".repeat(32) +} + +/// hello.txt v2: v1 with an appended line, reachable from v1 by a small patch. +pub(crate) fn demo_v2() -> Vec { + let mut v = demo_v1(); + v.extend_from_slice(b"...and now, hello world!\n"); + v +} + +/// Build the canonical PFS-MS reference file for the Section 17 scenario. 
+pub fn build_reference_vector() -> Result> { + let node_docs = id(0xD0); + let node_hello = id(0xF0); + + let mut w = FsWriter::create(Cursor::new(Vec::new()), ALGO)?; + + // ---- Session 1: root, docs/, hello.txt v1 (DIRECT, DEFLATE) ---------- + let v1 = demo_v1(); + let v1_stored = compress_deflate(&v1)?; // smaller than v1; stored compressed + let root = NodeRecord { + kind: KIND_DIR, + flags: 0, + node_id: ROOT_NODE_ID, + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: Vec::new(), + content: None, + }; + let docs = NodeRecord { + kind: KIND_DIR, + flags: 0, + node_id: node_docs, + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: b"docs".to_vec(), + content: None, + }; + let hello1 = NodeRecord { + kind: KIND_FILE, + flags: 0, + node_id: node_hello, + parent_id: node_docs, + mtime_unix_ms: 0, + mode: 0, + name: b"hello.txt".to_vec(), + content: Some(ContentSection::Direct { + compression_algo: COMPRESS_DEFLATE, + content_uid: id(0x11), + full_size: v1.len() as u64, + full_hash_algo: ALGO, + full_hash: ALGO.compute(&v1), + }), + }; + w.commit( + vec![ + Partition::raw(id(0x11), "content", v1_stored), + Partition::node(id(0x21), &root), + Partition::node(id(0x22), &docs), + Partition::node(id(0x23), &hello1), + ], + id(0x31), + 3, + 0, + b"", + )?; + + // ---- Session 2: modify hello.txt to v2 (DELTA, patch stored verbatim) - + let v2 = demo_v2(); + let patch = diff_vcdiff(&v1, &v2)?; + let hello2 = NodeRecord { + kind: KIND_FILE, + flags: 0, + node_id: node_hello, + parent_id: node_docs, + mtime_unix_ms: 0, + mode: 0, + name: b"hello.txt".to_vec(), + content: Some(ContentSection::Delta { + patch_algo: PATCH_VCDIFF, + compression_algo: COMPRESS_NONE, + patch_uid: id(0x12), + full_size: v2.len() as u64, + full_hash_algo: ALGO, + full_hash: ALGO.compute(&v2), + base_full_size: v1.len() as u64, + base_full_hash_algo: ALGO, + base_full_hash: ALGO.compute(&v1), + }), + }; + w.commit( + vec![ + Partition::raw(id(0x12), "patch", patch), + 
Partition::node(id(0x24), &hello2), + ], + id(0x32), + 1, + 0, + b"", + )?; + + // ---- Session 3: rename docs -> documents, tombstone hello.txt -------- + let documents = NodeRecord { + kind: KIND_DIR, + flags: 0, + node_id: node_docs, + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: b"documents".to_vec(), + content: None, + }; + let hello_tomb = NodeRecord { + kind: KIND_FILE, + flags: FLAG_TOMBSTONE, + node_id: node_hello, + parent_id: node_docs, + mtime_unix_ms: 0, + mode: 0, + name: b"hello.txt".to_vec(), + content: None, + }; + w.commit( + vec![ + Partition::node(id(0x25), &documents), + Partition::node(id(0x26), &hello_tomb), + ], + id(0x33), + 2, + 0, + b"", + )?; + + Ok(w.into_storage().into_inner()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{FsReader, ROOT_NODE_ID}; + + #[test] + fn reference_vector_is_deterministic() { + let a = build_reference_vector().unwrap(); + let b = build_reference_vector().unwrap(); + assert_eq!(a, b, "the reference vector must be byte-reproducible"); + } + + #[test] + fn reference_vector_reconstructs() { + let bytes = build_reference_vector().unwrap(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + // At the head: /documents (renamed), hello.txt gone. + let tree = r.tree().unwrap(); + let kids: Vec = tree.children[&ROOT_NODE_ID] + .iter() + .map(|id| tree.nodes[id].name_str()) + .collect(); + assert_eq!(kids, vec!["documents".to_string()]); + // History query: hello.txt at session 2 reads the v2 payload, decoded + // from the DELTA patch applied to the DEFLATE-compressed v1 base. + assert_eq!( + r.read_path_as_of("docs/hello.txt", Some(2)).unwrap(), + demo_v2() + ); + } +} diff --git a/reference/PFS-MS-v1.0/src/writer.rs b/reference/PFS-MS-v1.0/src/writer.rs new file mode 100644 index 0000000..f49d7a8 --- /dev/null +++ b/reference/PFS-MS-v1.0/src/writer.rs @@ -0,0 +1,860 @@ +//! The append-only, multi-session writer (Sections 4, 6, 12). +//! +//! 
[`FsWriter`] operates directly on a `Read + Write + Seek` store using PCF's +//! pure serialization primitives ([`pcf::PartitionEntry`], +//! [`pcf::TableBlockHeader`], [`pcf::compute_table_hash`], [`pcf::FileHeader`]). +//! It never uses PCF's in-place `Container` writer, because PFS-MS requires +//! backward-linked Table Blocks and a single in-place header-pointer rewrite at +//! commit — neither of which the PCF writer performs. + +use std::collections::{HashMap, HashSet}; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use pcf::{ + compute_table_hash, encode_label, Container, FileHeader, HashAlgo, PartitionEntry, + TableBlockHeader, ENTRY_SIZE, HEADER_SIZE, MAX_ENTRIES_PER_BLOCK, TABLE_HEADER_SIZE, + VERSION_MAJOR, VERSION_MINOR, +}; + +use crate::consts::*; +use crate::error::{Error, Result}; +use crate::node::{ContentSection, NodeRecord}; +use crate::reader::{build_node_view, scan, NodeView, Scan}; +use crate::session::{member_blocks_digest, SessionRecord}; +use crate::tree::{build_tree, current_delta_depth, read_file, resolve_path, Tree}; + +/// One partition to publish in a session (RAW content, or a serialized record). +#[derive(Debug, Clone)] +pub struct Partition { + /// PCF partition type. + pub partition_type: u32, + /// PCF uid (must be unique and non-NIL). + pub uid: [u8; 16], + /// 32-byte PCF label field. + pub label: [u8; 32], + /// Partition data bytes. + pub data: Vec, +} + +impl Partition { + /// A RAW content partition (full bytes or a delta patch). + pub fn raw(uid: [u8; 16], label: &str, data: Vec) -> Self { + Partition { + partition_type: RAW_TYPE, + uid, + label: lbl(label), + data, + } + } + /// A PFS_NODE partition carrying one serialized Node Record. 
+ pub fn node(uid: [u8; 16], record: &NodeRecord) -> Self { + Partition { + partition_type: PFS_NODE_TYPE, + uid, + label: lbl("node"), + data: record.to_bytes(), + } + } +} + +fn lbl(s: &str) -> [u8; 32] { + encode_label(s).expect("static label is valid") +} + +/// One declarative change applied to the filesystem within a single session by +/// [`FsWriter::commit_changes`]. Paths are '/'-separated, relative to the root. +#[derive(Debug, Clone)] +pub enum Change { + /// Ensure a directory exists at `path` (a no-op if it already does). + Mkdir { + /// Directory path. + path: String, + /// POSIX permission bits (0 = unset). + mode: u32, + /// Modification time in unix milliseconds (0 = unspecified). + mtime_unix_ms: u64, + }, + /// Create or replace the file at `path` with `content`. + PutFile { + /// File path. + path: String, + /// New file content. + content: Vec, + /// POSIX permission bits (0 = unset). + mode: u32, + /// Modification time in unix milliseconds (0 = unspecified). + mtime_unix_ms: u64, + }, + /// Delete the node at `path` (recursive by ancestry for directories). + Remove { + /// Path to delete. + path: String, + }, +} + +/// Normalise a '/'-separated path: drop empty, '.', leading/trailing segments. +fn norm_path(path: &str) -> String { + path.split('/') + .filter(|c| !c.is_empty() && *c != ".") + .collect::>() + .join("/") +} + +/// Split a normalised path into (parent path, final component). +fn split_parent(path: &str) -> (String, &str) { + match path.rsplit_once('/') { + Some((p, n)) => (p.to_string(), n), + None => (String::new(), path), + } +} + +/// A fresh 16-byte identifier (UUIDv7, recommended by both specs). +pub fn new_id() -> [u8; 16] { + *uuid::Uuid::now_v7().as_bytes() +} + +fn now_ms() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0) +} + +/// An append-only PFS-MS writer. 
pub struct FsWriter<S> {
    /// Backing store all reads and writes go through.
    storage: S,
    /// Hash algorithm used for data hashes and table hashes of new sessions.
    hash_algo: HashAlgo,
    /// Offset of the current committed HEAD block (0 if none yet).
    head_offset: u64,
    /// `table_hash` of the current HEAD block, chained into the next session.
    prev_head_hash: [u8; HASH_FIELD_SIZE],
    /// Algorithm that produced `prev_head_hash`.
    prev_head_algo: HashAlgo,
    /// session_seq the next commit will use.
    next_seq: u64,
    /// Offset at which the next appended byte is written.
    eof: u64,
    /// Free-form writer identifier recorded by the high-level operations.
    writer_id: Vec<u8>,
    /// Whether content and patches may be stored DEFLATE-compressed.
    compress: bool,
}

impl<S: Read + Write + Seek> FsWriter<S> {
    /// Create an empty container (no sessions yet). The header points nowhere
    /// (`partition_table_offset = 0`) until the first session is committed; a
    /// reader of this transient state sees an empty filesystem.
    pub fn create(mut storage: S, hash_algo: HashAlgo) -> Result<Self> {
        let header = FileHeader {
            version_major: VERSION_MAJOR,
            version_minor: VERSION_MINOR,
            partition_table_offset: 0,
        };
        storage.seek(SeekFrom::Start(0))?;
        storage.write_all(&header.to_bytes())?;
        storage.flush()?;
        Ok(Self {
            storage,
            hash_algo,
            head_offset: 0,
            prev_head_hash: [0u8; HASH_FIELD_SIZE],
            prev_head_algo: HashAlgo::None,
            next_seq: 1,
            eof: HEADER_SIZE,
            writer_id: b"pfs-ms-ref/1.0".to_vec(),
            compress: true,
        })
    }

    /// Create a new filesystem and commit session 1 with an explicit root
    /// directory record (Section 10.1).
    pub fn mkfs(storage: S, hash_algo: HashAlgo) -> Result<Self> {
        let mut w = Self::create(storage, hash_algo)?;
        let root = NodeRecord {
            kind: KIND_DIR,
            flags: 0,
            node_id: ROOT_NODE_ID,
            parent_id: ROOT_NODE_ID,
            mtime_unix_ms: now_ms(),
            mode: 0,
            name: Vec::new(),
            content: None,
        };
        let part = Partition::node(new_id(), &root);
        let wid = w.writer_id.clone();
        w.commit(vec![part], new_id(), 1, now_ms(), &wid)?;
        Ok(w)
    }

    /// Reopen an existing PFS-MS file for appending further sessions.
+ pub fn open(mut storage: S) -> Result { + let (head_offset, prev_head_hash, prev_head_algo, next_seq, hash_algo) = { + let mut c = Container::open(&mut storage)?; + let head = c.header().partition_table_offset; + if head == 0 { + ( + 0, + [0u8; HASH_FIELD_SIZE], + HashAlgo::None, + 1, + HashAlgo::Sha256, + ) + } else { + let hv = c.read_block_at(head)?; + let sess_entry = hv + .entries + .iter() + .find(|e| e.partition_type == PFS_SESSION_TYPE) + .ok_or(Error::BrokenChain("HEAD block has no PFS_SESSION"))? + .clone(); + let data = c.read_partition_data(&sess_entry)?; + let rec = SessionRecord::from_bytes(&data)?; + ( + head, + hv.header.table_hash, + hv.header.table_hash_algo, + rec.session_seq + 1, + hv.header.table_hash_algo, + ) + } + }; + let eof = storage.seek(SeekFrom::End(0))?; + Ok(Self { + storage, + hash_algo, + head_offset, + prev_head_hash, + prev_head_algo, + next_seq, + eof, + writer_id: b"pfs-ms-ref/1.0".to_vec(), + compress: true, + }) + } + + /// Set the free-form writer identifier recorded in each session. + pub fn set_writer_id(&mut self, id: &[u8]) { + self.writer_id = id.to_vec(); + } + + /// Enable or disable content compression for subsequent writes. When + /// disabled, content and patches are always stored verbatim + /// (compression_algo_id = 0). Compression is enabled by default. + pub fn set_compression(&mut self, enabled: bool) { + self.compress = enabled; + } + + /// Consume the writer and return the backing store. + pub fn into_storage(self) -> S { + self.storage + } + + /// The current committed head offset (0 before the first commit). + pub fn head_offset(&self) -> u64 { + self.head_offset + } + + /// The session_seq that the next commit will use. 
+ pub fn next_seq(&self) -> u64 { + self.next_seq + } + + // ---- low-level I/O ---------------------------------------------------- + + fn write_at(&mut self, off: u64, buf: &[u8]) -> Result<()> { + self.storage.seek(SeekFrom::Start(off))?; + self.storage.write_all(buf)?; + Ok(()) + } + + fn write_block( + &mut self, + off: u64, + next: u64, + algo: HashAlgo, + entries: &[PartitionEntry], + ) -> Result<[u8; HASH_FIELD_SIZE]> { + let hash = compute_table_hash(algo, next, entries); + let header = TableBlockHeader { + partition_count: entries.len() as u8, + next_table_offset: next, + table_hash_algo: algo, + table_hash: hash, + }; + self.write_at(off, &header.to_bytes())?; + let mut buf = Vec::with_capacity(entries.len() * ENTRY_SIZE as usize); + for e in entries { + buf.extend_from_slice(&e.to_bytes()); + } + self.write_at(off + TABLE_HEADER_SIZE, &buf)?; + Ok(hash) + } + + // ---- the commit protocol (Section 6.2) -------------------------------- + + /// Publish one session containing `parts` (RAW content and PFS_NODE + /// partitions) plus an internally built PFS_SESSION partition. Follows the + /// commit protocol S1..S7: append all data and blocks beyond the live + /// chain, then atomically repoint the header. + pub fn commit( + &mut self, + parts: Vec, + session_uid: [u8; 16], + change_count: u16, + timestamp: u64, + writer: &[u8], + ) -> Result<()> { + let algo = self.hash_algo; + let first_session = self.head_offset == 0; + let prev_head = self.head_offset; // 0 if first session + + // S1/S2: write content + node partition data; build their entries. 
+ let mut non_session: Vec = Vec::with_capacity(parts.len()); + for p in &parts { + let start = self.eof; + if !p.data.is_empty() { + self.write_at(start, &p.data)?; + } + self.eof += p.data.len() as u64; + non_session.push(PartitionEntry { + partition_type: p.partition_type, + uid: p.uid, + label: p.label, + start_offset: start, + max_length: p.data.len() as u64, + used_bytes: p.data.len() as u64, + data_hash_algo: algo, + data_hash: algo.compute(&p.data), + }); + } + + // Reserve the PFS_SESSION data region (length is known up front). + let session_len = SESSION_PREFIX_LEN + writer.len(); + let session_start = self.eof; + self.eof += session_len as u64; + + // Split entries into blocks: the PFS_SESSION entry plus up to 254 + // others in the HEAD block; the rest in MEMBER blocks of <=255. + let head_other_cap = (MAX_ENTRIES_PER_BLOCK as usize) - 1; + let head_take = non_session.len().min(head_other_cap); + let (head_others, rest) = non_session.split_at(head_take); + let member_chunks: Vec<&[PartitionEntry]> = + rest.chunks(MAX_ENTRIES_PER_BLOCK as usize).collect(); + let block_count = 1 + member_chunks.len(); + + // S4 (offsets): MEMBER blocks first (chain order), then the HEAD block. + let mut member_offsets = Vec::with_capacity(member_chunks.len()); + for chunk in &member_chunks { + member_offsets.push(self.eof); + self.eof += TABLE_HEADER_SIZE + chunk.len() as u64 * ENTRY_SIZE; + } + let head_offset = self.eof; + let head_count = 1 + head_others.len(); + self.eof += TABLE_HEADER_SIZE + head_count as u64 * ENTRY_SIZE; + + // Chain: HEAD -> m0 -> m1 -> ... -> m_{k-1} -> prev_head (or 0). 
+ let mut member_nexts = Vec::with_capacity(member_chunks.len()); + let mut member_hashes = Vec::with_capacity(member_chunks.len()); + for i in 0..member_chunks.len() { + let next = if i + 1 < member_chunks.len() { + member_offsets[i + 1] + } else { + prev_head + }; + member_nexts.push(next); + member_hashes.push(compute_table_hash(algo, next, member_chunks[i])); + } + let head_next = member_offsets.first().copied().unwrap_or(prev_head); + + // S2/S3: build and write the PFS_SESSION record + entry. + let (prev_algo, prev_hash) = if first_session { + (HashAlgo::None, [0u8; HASH_FIELD_SIZE]) + } else { + (self.prev_head_algo, self.prev_head_hash) + }; + let (mdigest_algo, mdigest) = if member_chunks.is_empty() { + (HashAlgo::None, [0u8; HASH_FIELD_SIZE]) + } else { + (algo, member_blocks_digest(algo, &member_hashes)) + }; + let session_rec = SessionRecord { + profile_version_major: PROFILE_VERSION_MAJOR, + profile_version_minor: PROFILE_VERSION_MINOR, + session_seq: self.next_seq, + timestamp_unix_ms: timestamp, + prev_session_hash_algo: prev_algo, + prev_session_hash: prev_hash, + block_count: block_count as u32, + member_digest_algo: mdigest_algo, + member_blocks_digest: mdigest, + change_count, + writer: writer.to_vec(), + }; + let session_bytes = session_rec.to_bytes(); + debug_assert_eq!(session_bytes.len(), session_len); + self.write_at(session_start, &session_bytes)?; + let session_entry = PartitionEntry { + partition_type: PFS_SESSION_TYPE, + uid: session_uid, + label: lbl("session"), + start_offset: session_start, + max_length: session_len as u64, + used_bytes: session_len as u64, + data_hash_algo: algo, + data_hash: algo.compute(&session_bytes), + }; + + // S4: write MEMBER blocks first, then the HEAD block last (its + // table_hash commits to the member digest via the session record). 
+ for i in 0..member_chunks.len() { + self.write_block(member_offsets[i], member_nexts[i], algo, member_chunks[i])?; + } + let mut head_entries = Vec::with_capacity(head_count); + head_entries.push(session_entry); + head_entries.extend_from_slice(head_others); + let head_hash = self.write_block(head_offset, head_next, algo, &head_entries)?; + + // S5: flush data + blocks before publishing. + self.storage.flush()?; + // S6: the single permitted in-place write — the 8-byte header pointer. + self.write_at(12, &head_offset.to_le_bytes())?; + // S7: flush the header. + self.storage.flush()?; + + // Advance writer state. + self.head_offset = head_offset; + self.prev_head_hash = head_hash; + self.prev_head_algo = algo; + self.next_seq += 1; + Ok(()) + } + + // ---- high-level filesystem operations (Section 10.4) ------------------ + + fn snapshot(&mut self) -> Result<(Scan, NodeView, Tree)> { + let scan = { + let mut c = Container::open(&mut self.storage)?; + scan(&mut c)? + }; + let view = build_node_view(&scan, None); + let tree = build_tree(&view)?; + Ok((scan, view, tree)) + } + + fn current_content(&mut self, node_id: [u8; 16]) -> Result>> { + let scan = { + let mut c = Container::open(&mut self.storage)?; + scan(&mut c)? + }; + let view = build_node_view(&scan, None); + if !view.history.contains_key(&node_id) { + return Ok(None); + } + let mut c = Container::open(&mut self.storage)?; + match read_file(&mut c, &scan, &view, node_id) { + Ok(b) => Ok(Some(b)), + Err(Error::NotFound) => Ok(None), + Err(e) => Err(e), + } + } + + /// Split a path into (live parent directory id, final name bytes). + fn resolve_parent(&self, tree: &Tree, path: &str) -> Result<([u8; 16], Vec)> { + let trimmed = path.trim_end_matches('/'); + let (parent_path, name) = match trimmed.rsplit_once('/') { + Some((p, n)) => (p, n), + None => ("", trimmed), + }; + if name.is_empty() || name == "." || name == ".." 
{ + return Err(Error::InvalidPath("illegal final component")); + } + if name.as_bytes().contains(&0) || name.len() > PFS_MAX_NAME { + return Err(Error::InvalidPath("illegal name")); + } + let parent_id = resolve_path(tree, parent_path)?; + let parent = tree.nodes.get(&parent_id).ok_or(Error::NotFound)?; + if !parent.is_dir() { + return Err(Error::NotADirectory); + } + Ok((parent_id, name.as_bytes().to_vec())) + } + + fn live_child(tree: &Tree, parent_id: [u8; 16], name: &[u8]) -> Option<[u8; 16]> { + tree.children.get(&parent_id).and_then(|kids| { + kids.iter() + .find(|id| tree.nodes.get(*id).map(|r| r.name == name).unwrap_or(false)) + .copied() + }) + } + + /// Create a directory at `path` (Section 10.4). + pub fn mkdir(&mut self, path: &str) -> Result<()> { + let (_, _, tree) = self.snapshot()?; + let (parent_id, name) = self.resolve_parent(&tree, path)?; + if Self::live_child(&tree, parent_id, &name).is_some() { + return Err(Error::AlreadyExists); + } + let rec = NodeRecord { + kind: KIND_DIR, + flags: 0, + node_id: new_id(), + parent_id, + mtime_unix_ms: now_ms(), + mode: 0, + name, + content: None, + }; + let part = Partition::node(new_id(), &rec); + let wid = self.writer_id.clone(); + self.commit(vec![part], new_id(), 1, now_ms(), &wid) + } + + /// Create or modify the file at `path` with `content` (Section 10.4), + /// choosing DIRECT vs DELTA automatically (Sections 9.2, 9.4). 
+ pub fn put_file(&mut self, path: &str, content: &[u8]) -> Result<()> { + let (_, view, tree) = self.snapshot()?; + let (parent_id, name) = self.resolve_parent(&tree, path)?; + + let mut parts: Vec = Vec::new(); + let existing = Self::live_child(&tree, parent_id, &name); + let node_id; + let content_section; + + match existing { + Some(id) => { + let rec = tree.nodes.get(&id).ok_or(Error::NotFound)?; + if rec.is_dir() { + return Err(Error::NotADirectory); + } + node_id = id; + let prev = self.current_content(id)?.unwrap_or_default(); + content_section = + self.build_modified_content(&mut parts, &prev, content, &view, id); + } + None => { + node_id = new_id(); + content_section = self.build_new_content(&mut parts, content); + } + } + + let rec = NodeRecord { + kind: KIND_FILE, + flags: 0, + node_id, + parent_id, + mtime_unix_ms: now_ms(), + mode: 0, + name, + content: Some(content_section), + }; + parts.push(Partition::node(new_id(), &rec)); + let wid = self.writer_id.clone(); + self.commit(parts, new_id(), 1, now_ms(), &wid) + } + + /// DEFLATE `bytes` and return the smaller of (compressed, verbatim) along + /// with the `compression_algo_id` describing the chosen form (Section 9.5). + /// Returns the verbatim bytes when compression is disabled or not smaller. 
+ fn maybe_compress(&self, bytes: &[u8]) -> (u8, Vec) { + if self.compress { + if let Ok(packed) = crate::compress::compress_deflate(bytes) { + if packed.len() < bytes.len() { + return (COMPRESS_DEFLATE, packed); + } + } + } + (COMPRESS_NONE, bytes.to_vec()) + } + + fn build_new_content(&self, parts: &mut Vec, content: &[u8]) -> ContentSection { + let algo = self.hash_algo; + if content.is_empty() { + return ContentSection::Empty; + } + let content_uid = new_id(); + let (compression_algo, stored) = self.maybe_compress(content); + parts.push(Partition::raw(content_uid, "content", stored)); + ContentSection::Direct { + compression_algo, + content_uid, + full_size: content.len() as u64, + full_hash_algo: algo, + full_hash: algo.compute(content), + } + } + + fn build_modified_content( + &self, + parts: &mut Vec, + prev: &[u8], + content: &[u8], + view: &NodeView, + node_id: [u8; 16], + ) -> ContentSection { + let algo = self.hash_algo; + if content.is_empty() { + return ContentSection::Empty; + } + // Prefer DELTA only when a smaller patch exists and re-baselining is not + // yet required (Section 9.4). + if !prev.is_empty() { + if let Ok(patch) = crate::delta::diff_vcdiff(prev, content) { + let depth = current_delta_depth(view, node_id); + if patch.len() < content.len() && depth < RECOMMENDED_MAX_DELTA_DEPTH { + let patch_uid = new_id(); + let (compression_algo, stored) = self.maybe_compress(&patch); + parts.push(Partition::raw(patch_uid, "patch", stored)); + return ContentSection::Delta { + patch_algo: PATCH_VCDIFF, + compression_algo, + patch_uid, + full_size: content.len() as u64, + full_hash_algo: algo, + full_hash: algo.compute(content), + base_full_size: prev.len() as u64, + base_full_hash_algo: algo, + base_full_hash: algo.compute(prev), + }; + } + } + } + self.build_new_content(parts, content) + } + + /// Move and/or rename `src` to `dst` (Section 10.4). A file carries INHERIT + /// content so its bytes are preserved without a copy. 
+ pub fn mv(&mut self, src: &str, dst: &str) -> Result<()> { + let (_, _, tree) = self.snapshot()?; + let src_id = resolve_path(&tree, src)?; + if src_id == ROOT_NODE_ID { + return Err(Error::InvalidPath("cannot move the root")); + } + let src_rec = tree.nodes.get(&src_id).ok_or(Error::NotFound)?.clone(); + let (parent_id, name) = self.resolve_parent(&tree, dst)?; + if Self::live_child(&tree, parent_id, &name).is_some() { + return Err(Error::AlreadyExists); + } + let content = if src_rec.is_file() { + Some(ContentSection::Inherit) + } else { + None + }; + let rec = NodeRecord { + kind: src_rec.kind, + flags: 0, + node_id: src_id, + parent_id, + mtime_unix_ms: now_ms(), + mode: src_rec.mode, + name, + content, + }; + let part = Partition::node(new_id(), &rec); + let wid = self.writer_id.clone(); + self.commit(vec![part], new_id(), 1, now_ms(), &wid) + } + + /// Delete the node at `path` by tombstone (Section 10.4). Directory + /// deletion is recursive by ancestry (Section 10.2). + pub fn rm(&mut self, path: &str) -> Result<()> { + let (_, _, tree) = self.snapshot()?; + let id = resolve_path(&tree, path)?; + if id == ROOT_NODE_ID { + return Err(Error::InvalidPath("cannot delete the root")); + } + let rec = tree.nodes.get(&id).ok_or(Error::NotFound)?.clone(); + let tomb = NodeRecord { + kind: rec.kind, + flags: FLAG_TOMBSTONE, + node_id: id, + parent_id: rec.parent_id, + mtime_unix_ms: now_ms(), + mode: 0, + name: rec.name, + content: None, + }; + let part = Partition::node(new_id(), &tomb); + let wid = self.writer_id.clone(); + self.commit(vec![part], new_id(), 1, now_ms(), &wid) + } + + /// Apply a batch of [`Change`]s as a single session (one "burn"). Used by + /// the directory-level tooling so importing a whole tree is one session + /// rather than one per file. Unchanged files and already-existing + /// directories produce no record; if nothing changes, no session is + /// committed. 
+    /// Apply a batch of [`Change`]s and commit the resulting records as one
+    /// session.
+    ///
+    /// The batch is regrouped before processing so that parents precede their
+    /// children: `Mkdir` entries first (shallowest path first), then `PutFile`,
+    /// then `Remove` (deepest path first). Directories created earlier in the
+    /// batch can be used as parents by later entries of the same batch.
+    ///
+    /// A `Mkdir` of a directory that is already live, and a `PutFile` whose
+    /// content equals the current content, produce no record. If the whole
+    /// batch produces no record, nothing is committed and `Ok(())` is returned.
+    ///
+    /// # Errors
+    ///
+    /// * [`Error::NotFound`] if the parent directory of a new entry cannot be
+    ///   resolved (removals propagate whatever `resolve_path` reports).
+    /// * [`Error::NotADirectory`] if a `Mkdir` path is occupied by a live
+    ///   non-directory, or a `PutFile` path is occupied by a live directory.
+    /// * [`Error::DuplicateNodeInSession`] if two changes would emit a record
+    ///   for the same node.
+    /// * [`Error::InvalidPath`] if a `Remove` targets the root.
+    pub fn commit_changes(&mut self, changes: &[Change]) -> Result<()> {
+        let (_, view, tree) = self.snapshot()?;
+
+        // Directory path -> node_id, for resolving parents of new entries.
+        // Extended with directories created earlier in this same batch.
+        let mut dir_ids: HashMap<String, [u8; 16]> = HashMap::new();
+        dir_ids.insert(String::new(), ROOT_NODE_ID);
+
+        // Resolve a directory path to a node_id (committed tree or this batch).
+        let resolve_dir =
+            |tree: &Tree, dir_ids: &HashMap<String, [u8; 16]>, path: &str| -> Option<[u8; 16]> {
+                if let Some(id) = dir_ids.get(path) {
+                    return Some(*id);
+                }
+                let id = resolve_path(tree, path).ok()?;
+                if tree.nodes.get(&id).map(|r| r.is_dir()).unwrap_or(false) {
+                    Some(id)
+                } else {
+                    None
+                }
+            };
+
+        let mut parts: Vec<Partition> = Vec::new();
+        let mut used: HashSet<[u8; 16]> = HashSet::new();
+        let mut records = 0usize;
+
+        // Order so parents precede children: shallow Mkdir, then PutFile, then
+        // Remove (deepest first, so a child can be tombstoned before its dir).
+        let mut mkdirs: Vec<&Change> = Vec::new();
+        let mut puts: Vec<&Change> = Vec::new();
+        let mut removes: Vec<&Change> = Vec::new();
+        for c in changes {
+            match c {
+                Change::Mkdir { .. } => mkdirs.push(c),
+                Change::PutFile { .. } => puts.push(c),
+                Change::Remove { .. } => removes.push(c),
+            }
+        }
+        let depth = |p: &str| p.matches('/').count();
+        mkdirs.sort_by_key(|c| match c {
+            Change::Mkdir { path, .. } => depth(&norm_path(path)),
+            _ => 0,
+        });
+        removes.sort_by_key(|c| match c {
+            Change::Remove { path } => std::cmp::Reverse(depth(&norm_path(path))),
+            _ => std::cmp::Reverse(0),
+        });
+
+        // Record that `id` receives a record in this session; a second record
+        // for the same node is rejected.
+        let mark = |id: [u8; 16], used: &mut HashSet<[u8; 16]>| -> Result<()> {
+            if !used.insert(id) {
+                return Err(Error::DuplicateNodeInSession);
+            }
+            Ok(())
+        };
+
+        // ---- directories ----
+        for c in mkdirs {
+            let (path, mode, mtime) = match c {
+                Change::Mkdir {
+                    path,
+                    mode,
+                    mtime_unix_ms,
+                } => (norm_path(path), *mode, *mtime_unix_ms),
+                _ => unreachable!(),
+            };
+            if path.is_empty() {
+                continue; // the root always exists
+            }
+            // Already a live directory? Just register it.
+            if let Some(id) = resolve_dir(&tree, &dir_ids, &path) {
+                dir_ids.insert(path, id);
+                continue;
+            }
+            // A live non-directory in the way is a conflict.
+            if resolve_path(&tree, &path).is_ok() {
+                return Err(Error::NotADirectory);
+            }
+            let (parent, name) = split_parent(&path);
+            let parent_id = resolve_dir(&tree, &dir_ids, &parent).ok_or(Error::NotFound)?;
+            let node_id = new_id();
+            mark(node_id, &mut used)?;
+            let rec = NodeRecord {
+                kind: KIND_DIR,
+                flags: 0,
+                node_id,
+                parent_id,
+                mtime_unix_ms: mtime,
+                mode,
+                name: name.as_bytes().to_vec(),
+                content: None,
+            };
+            parts.push(Partition::node(new_id(), &rec));
+            records += 1;
+            dir_ids.insert(path, node_id);
+        }
+
+        // ---- files ----
+        for c in puts {
+            let (path, content, mode, mtime) = match c {
+                Change::PutFile {
+                    path,
+                    content,
+                    mode,
+                    mtime_unix_ms,
+                } => (norm_path(path), content, *mode, *mtime_unix_ms),
+                _ => unreachable!(),
+            };
+            let (parent, name) = split_parent(&path);
+            let parent_id = resolve_dir(&tree, &dir_ids, &parent).ok_or(Error::NotFound)?;
+            let name = name.as_bytes().to_vec();
+
+            // An existing live file under a committed parent is modified in
+            // place (same node_id); anything under a freshly created directory
+            // is necessarily new.
+            let existing = Self::live_child(&tree, parent_id, &name);
+            let (node_id, content_section) = match existing {
+                Some(id) => {
+                    if tree.nodes.get(&id).map(|r| r.is_dir()).unwrap_or(false) {
+                        return Err(Error::NotADirectory);
+                    }
+                    let prev = self.current_content(id)?.unwrap_or_default();
+                    if prev == *content {
+                        continue; // unchanged: no record
+                    }
+                    (
+                        id,
+                        self.build_modified_content(&mut parts, &prev, content, &view, id),
+                    )
+                }
+                None => (new_id(), self.build_new_content(&mut parts, content)),
+            };
+            mark(node_id, &mut used)?;
+            let rec = NodeRecord {
+                kind: KIND_FILE,
+                flags: 0,
+                node_id,
+                parent_id,
+                mtime_unix_ms: mtime,
+                mode,
+                name,
+                content: Some(content_section),
+            };
+            parts.push(Partition::node(new_id(), &rec));
+            records += 1;
+        }
+
+        // ---- removals ----
+        for c in removes {
+            let path = match c {
+                Change::Remove { path } => norm_path(path),
+                _ => unreachable!(),
+            };
+            let id = resolve_path(&tree, &path)?;
+            if id == ROOT_NODE_ID {
+                return Err(Error::InvalidPath("cannot delete the root"));
+            }
+            let rec = tree.nodes.get(&id).ok_or(Error::NotFound)?.clone();
+            mark(id, &mut used)?;
+            // The tombstone keeps the node's identity (kind, parent, name) and
+            // carries no content.
+            let tomb = NodeRecord {
+                kind: rec.kind,
+                flags: FLAG_TOMBSTONE,
+                node_id: id,
+                parent_id: rec.parent_id,
+                mtime_unix_ms: now_ms(),
+                mode: 0,
+                name: rec.name,
+                content: None,
+            };
+            parts.push(Partition::node(new_id(), &tomb));
+            records += 1;
+        }
+
+        if records == 0 {
+            return Ok(()); // nothing changed; no session
+        }
+        let wid = self.writer_id.clone();
+        // change_count is a u16 field: saturate rather than wrap on huge batches.
+        let change_count = records.min(u16::MAX as usize) as u16;
+        self.commit(parts, new_id(), change_count, now_ms(), &wid)
+    }
+}
diff --git a/reference/PFS-MS-v1.0/tests/coverage.rs b/reference/PFS-MS-v1.0/tests/coverage.rs
new file mode 100644
index 0000000..7a86228
--- /dev/null
+++ b/reference/PFS-MS-v1.0/tests/coverage.rs
@@ -0,0 +1,353 @@
+//! Targeted error-path and edge-case tests for `pfs-ms`.
+ +use std::io::Cursor; + +use pcf::HashAlgo; +use pfs_ms::{ + build_node_view, current_delta_depth, is_live, resolve_path, ContentSection, Error, FsReader, + FsWriter, NodeRecord, Partition, SessionRecord, KIND_FILE, ROOT_NODE_ID, +}; + +fn id(b: u8) -> [u8; 16] { + [b; 16] +} + +// ---- record parsing edges ------------------------------------------------ + +#[test] +fn node_parse_rejects_garbage_and_truncation() { + assert!(matches!( + NodeRecord::from_bytes(b"short"), + Err(Error::MalformedNode(_)) + )); + // Good prefix but bogus content kind. + let r = NodeRecord { + kind: KIND_FILE, + flags: 0, + node_id: id(1), + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: b"f".to_vec(), + content: Some(ContentSection::Empty), + }; + let mut bytes = r.to_bytes(); + *bytes.last_mut().unwrap() = 0x77; // unknown content_kind + assert!(matches!( + NodeRecord::from_bytes(&bytes), + Err(Error::MalformedNode(_)) + )); +} + +#[test] +fn session_parse_rejects_bad_block_count_field() { + let mut rec = SessionRecord { + profile_version_major: 1, + profile_version_minor: 0, + session_seq: 1, + timestamp_unix_ms: 0, + prev_session_hash_algo: HashAlgo::None, + prev_session_hash: [0u8; 64], + block_count: 1, + member_digest_algo: HashAlgo::None, + member_blocks_digest: [0u8; 64], + change_count: 0, + writer: Vec::new(), + }; + rec.block_count = 1; + let mut bytes = rec.to_bytes(); + // Zero out block_count (offset 89..93) -> must be rejected (>= 1). 
+ bytes[89] = 0; + bytes[90] = 0; + bytes[91] = 0; + bytes[92] = 0; + assert!(matches!( + SessionRecord::from_bytes(&bytes), + Err(Error::MalformedSession(_)) + )); +} + +// ---- content kinds ------------------------------------------------------- + +#[test] +fn empty_and_inherit_reconstruct() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("empty.txt", b"").unwrap(); // EMPTY + w.mkdir("d").unwrap(); + w.mv("empty.txt", "d/moved.txt").unwrap(); // INHERIT over EMPTY + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("d/moved.txt").unwrap(), b""); +} + +// ---- liveness / tree edges ---------------------------------------------- + +#[test] +fn resolve_path_errors() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("a.txt", b"x").unwrap(); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + let tree = r.tree().unwrap(); + assert!(matches!(resolve_path(&tree, "nope"), Err(Error::NotFound))); + // Descending into a file is "not a directory". + assert!(matches!( + resolve_path(&tree, "a.txt/inner"), + Err(Error::NotADirectory) + )); + assert!(is_live(&build_node_view(&r.scan().unwrap(), None), ROOT_NODE_ID).unwrap()); +} + +#[test] +fn name_collision_keeps_greater_seq() { + // Two live siblings with the same name (forced via low-level commit): the + // greater session_seq wins (Section 10.3, resilience rule). 
+ let mut w = FsWriter::create(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let a = id(0xA1); + let b = id(0xB1); + let mk = |node_id, content: &'static [u8]| NodeRecord { + kind: KIND_FILE, + flags: 0, + node_id, + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: b"dup.txt".to_vec(), + content: Some(ContentSection::Direct { + compression_algo: pfs_ms::COMPRESS_NONE, + content_uid: if node_id == a { id(0xC1) } else { id(0xC2) }, + full_size: content.len() as u64, + full_hash_algo: HashAlgo::Sha256, + full_hash: HashAlgo::Sha256.compute(content), + }), + }; + // Session 1: node a = "old". + w.commit( + vec![ + Partition::raw(id(0xC1), "c", b"old".to_vec()), + Partition::node(id(0x01), &mk(a, b"old")), + ], + id(0x31), + 1, + 0, + b"", + ) + .unwrap(); + // Session 2: node b, same name = "new" (greater seq wins). + w.commit( + vec![ + Partition::raw(id(0xC2), "c", b"new".to_vec()), + Partition::node(id(0x02), &mk(b, b"new")), + ], + id(0x32), + 1, + 0, + b"", + ) + .unwrap(); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("dup.txt").unwrap(), b"new"); +} + +// ---- delta depth --------------------------------------------------------- + +#[test] +fn delta_depth_grows_then_rebaselines() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let base = b"line\n".repeat(50); + w.put_file("f", &base).unwrap(); + // Many small edits should accumulate DELTA depth, then re-baseline at 16. + for i in 0..40u32 { + let mut v = base.clone(); + v.extend_from_slice(format!("edit {i}\n").as_bytes()); + w.put_file("f", &v).unwrap(); + } + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + let view = build_node_view(&r.scan().unwrap(), None); + // Find the file's node_id from the head tree. 
+ let tree = r.tree().unwrap(); + let fid = *tree.children[&ROOT_NODE_ID] + .iter() + .find(|id| tree.nodes[*id].name == b"f") + .unwrap(); + // Re-baselining keeps the live depth bounded by the recommended maximum. + assert!(current_delta_depth(&view, fid) <= pfs_ms::RECOMMENDED_MAX_DELTA_DEPTH); +} + +// ---- writer guard rails -------------------------------------------------- + +#[test] +fn writer_rejects_obvious_mistakes() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.mkdir("d").unwrap(); + assert!(matches!(w.mkdir("d"), Err(Error::AlreadyExists))); + w.put_file("d/f", b"x").unwrap(); + // Writing a file where a directory exists is rejected. + assert!(matches!(w.put_file("d", b"x"), Err(Error::NotADirectory))); + assert!(matches!(w.rm("/"), Err(Error::InvalidPath(_)))); + assert!(matches!(w.mv("d", "d"), Err(Error::AlreadyExists))); + assert!(matches!(w.rm("missing"), Err(Error::NotFound))); +} + +#[test] +fn writer_getters_and_writer_id() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + assert!(w.head_offset() > 0); + let seq_before = w.next_seq(); + w.set_writer_id(b"custom-agent"); + w.put_file("a.txt", b"x").unwrap(); + assert_eq!(w.next_seq(), seq_before + 1); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + let sessions = r.list_sessions().unwrap(); + assert!(sessions.iter().any(|s| s.writer == b"custom-agent")); + // node_view "as of" an early session sees no a.txt yet. 
+ let view = r.node_view(Some(1)).unwrap(); + assert!(view.current.contains_key(&ROOT_NODE_ID)); + let _ = r.into_storage(); +} + +#[test] +fn error_display_is_human_readable() { + for e in [ + Error::MalformedNode("x"), + Error::MalformedSession("x"), + Error::BrokenChain("x"), + Error::ChainHashMismatch, + Error::DuplicateNodeInSession, + Error::ParentCycle, + Error::MissingContent, + Error::ContentHashMismatch, + Error::MissingBase, + Error::UnsupportedPatchAlgo(9), + Error::DeltaTooDeep, + Error::Vcdiff("boom".into()), + Error::NotFound, + Error::NotADirectory, + Error::AlreadyExists, + Error::InvalidPath("x"), + ] { + assert!(!format!("{e}").is_empty()); + } +} + +// ---- compression (Section 9.4) ------------------------------------------ + +#[test] +fn compressible_content_is_stored_smaller_and_reconstructs() { + let payload = b"PFS-MS compresses repetitive data well. ".repeat(64); + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("big.txt", &payload).unwrap(); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("big.txt").unwrap(), payload); + + // The stored RAW content partition must be smaller than the raw payload. 
+ let scan = r.scan().unwrap(); + let raw = scan + .uid_index + .values() + .find(|e| e.partition_type == pfs_ms::RAW_TYPE && e.used_bytes > 0) + .unwrap(); + assert!( + (raw.used_bytes as usize) < payload.len(), + "stored {} vs raw {}", + raw.used_bytes, + payload.len() + ); +} + +#[test] +fn compression_can_be_disabled() { + let payload = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".repeat(20); // very compressible + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.set_compression(false); + w.put_file("v.txt", &payload).unwrap(); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("v.txt").unwrap(), payload); + // With compression disabled the content is stored verbatim. + let scan = r.scan().unwrap(); + let raw = scan + .uid_index + .values() + .find(|e| e.partition_type == pfs_ms::RAW_TYPE && e.used_bytes > 0) + .unwrap(); + assert_eq!(raw.used_bytes as usize, payload.len()); +} + +#[test] +fn unknown_compression_algo_is_reported_on_read() { + // Forge a DIRECT record claiming an unimplemented compression_algo_id. 
+ let mut w = FsWriter::create(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let rec = NodeRecord { + kind: KIND_FILE, + flags: 0, + node_id: id(5), + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: b"f".to_vec(), + content: Some(ContentSection::Direct { + compression_algo: 9, // unimplemented + content_uid: id(0xC0), + full_size: 5, + full_hash_algo: HashAlgo::Sha256, + full_hash: HashAlgo::Sha256.compute(b"hello"), + }), + }; + w.commit( + vec![ + Partition::raw(id(0xC0), "c", b"hello".to_vec()), + Partition::node(id(0x01), &rec), + ], + id(0x31), + 1, + 0, + b"", + ) + .unwrap(); + let bytes = w.into_storage().into_inner(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + assert!(matches!( + r.read_path("f"), + Err(Error::UnsupportedCompressionAlgo(9)) + )); +} + +#[test] +fn content_corruption_is_detected() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("secret.txt", b"important payload").unwrap(); + let mut bytes = w.into_storage().into_inner(); + + // Locate the content RAW partition and flip a byte in its data region. + let start = { + let mut r = FsReader::open(Cursor::new(bytes.clone())).unwrap(); + let scan = r.scan().unwrap(); + let entry = scan + .uid_index + .values() + .find(|e| e.partition_type == pfs_ms::RAW_TYPE && e.used_bytes > 0) + .unwrap(); + entry.start_offset as usize + }; + bytes[start] ^= 0xFF; + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + assert!(r.verify().is_err()); +} diff --git a/reference/PFS-MS-v1.0/tests/dirsync.rs b/reference/PFS-MS-v1.0/tests/dirsync.rs new file mode 100644 index 0000000..eeda2b4 --- /dev/null +++ b/reference/PFS-MS-v1.0/tests/dirsync.rs @@ -0,0 +1,229 @@ +//! End-to-end tests for the directory <-> archive tooling. 
+ +use std::collections::BTreeMap; +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; + +use pfs_ms::{create_archive, extract_archive, update_archive, FsReader, SyncOptions}; + +static COUNTER: AtomicU64 = AtomicU64::new(0); + +/// A unique temporary directory, removed on drop (no external dev-dependency). +struct TempDir(PathBuf); + +impl TempDir { + fn new() -> Self { + let n = COUNTER.fetch_add(1, Ordering::Relaxed); + let nanos = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos(); + let p = std::env::temp_dir().join(format!("pfs-test-{}-{n}-{nanos}", std::process::id())); + fs::create_dir_all(&p).unwrap(); + TempDir(p) + } + fn path(&self) -> &Path { + &self.0 + } +} + +impl Drop for TempDir { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.0); + } +} + +/// Collect (relative path -> file content) and the set of directory paths. +fn snapshot_dir(root: &Path) -> (BTreeMap>, Vec) { + let mut files = BTreeMap::new(); + let mut dirs = Vec::new(); + fn walk( + dir: &Path, + prefix: &str, + files: &mut BTreeMap>, + dirs: &mut Vec, + ) { + let mut entries: Vec<_> = fs::read_dir(dir).unwrap().map(|e| e.unwrap()).collect(); + entries.sort_by_key(|e| e.file_name()); + for e in entries { + let name = e.file_name().to_string_lossy().into_owned(); + let rel = if prefix.is_empty() { + name + } else { + format!("{prefix}/{name}") + }; + let ft = e.file_type().unwrap(); + if ft.is_dir() { + dirs.push(rel.clone()); + walk(&e.path(), &rel, files, dirs); + } else if ft.is_file() { + files.insert(rel, fs::read(e.path()).unwrap()); + } + } + } + walk(root, "", &mut files, &mut dirs); + dirs.sort(); + (files, dirs) +} + +fn write(path: &Path, content: &[u8]) { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).unwrap(); + } + fs::File::create(path).unwrap().write_all(content).unwrap(); +} + +fn head_seq(archive: &Path) -> u64 { + let f = 
fs::OpenOptions::new() + .read(true) + .write(true) + .open(archive) + .unwrap(); + let mut r = FsReader::open(f).unwrap(); + r.list_sessions() + .unwrap() + .iter() + .map(|s| s.session_seq) + .max() + .unwrap() +} + +#[test] +fn create_then_extract_roundtrips() { + let tmp = TempDir::new(); + let src = tmp.path().join("src"); + let archive = tmp.path().join("a.pfs"); + let out = tmp.path().join("out"); + + write(&src.join("readme.md"), b"top-level\n"); + write(&src.join("docs/guide.txt"), b"a guide\n"); + write(&src.join("docs/deep/nested.bin"), &[0u8, 1, 2, 3, 4, 5]); + fs::create_dir_all(src.join("empty")).unwrap(); // an empty directory + + create_archive(&archive, &src, &SyncOptions::default()).unwrap(); + extract_archive(&archive, &out, None, true).unwrap(); + + let (sf, sd) = snapshot_dir(&src); + let (of, od) = snapshot_dir(&out); + assert_eq!(sf, of, "file set/content must match"); + assert_eq!(sd, od, "directory set must match (incl. the empty dir)"); +} + +#[test] +fn create_rejects_existing_archive() { + let tmp = TempDir::new(); + let src = tmp.path().join("src"); + write(&src.join("f"), b"x"); + let archive = tmp.path().join("a.pfs"); + create_archive(&archive, &src, &SyncOptions::default()).unwrap(); + // Second create on the same path must fail. 
+ assert!(create_archive(&archive, &src, &SyncOptions::default()).is_err()); +} + +#[test] +fn update_adds_and_modifies() { + let tmp = TempDir::new(); + let src = tmp.path().join("src"); + let archive = tmp.path().join("a.pfs"); + let out = tmp.path().join("out"); + + write(&src.join("a.txt"), b"v1\n"); + create_archive(&archive, &src, &SyncOptions::default()).unwrap(); + + write(&src.join("a.txt"), b"v2 changed\n"); + write(&src.join("sub/b.txt"), b"new file\n"); + update_archive(&archive, &src, &SyncOptions::default()).unwrap(); + + extract_archive(&archive, &out, None, true).unwrap(); + assert_eq!(fs::read(out.join("a.txt")).unwrap(), b"v2 changed\n"); + assert_eq!(fs::read(out.join("sub/b.txt")).unwrap(), b"new file\n"); +} + +#[test] +fn update_with_delete_mirrors_removals() { + let tmp = TempDir::new(); + let src = tmp.path().join("src"); + let archive = tmp.path().join("a.pfs"); + let out = tmp.path().join("out"); + + write(&src.join("keep.txt"), b"keep\n"); + write(&src.join("gone.txt"), b"gone\n"); + create_archive(&archive, &src, &SyncOptions::default()).unwrap(); + + fs::remove_file(src.join("gone.txt")).unwrap(); + let opts = SyncOptions { + delete: true, + ..SyncOptions::default() + }; + update_archive(&archive, &src, &opts).unwrap(); + + extract_archive(&archive, &out, None, true).unwrap(); + assert!(out.join("keep.txt").exists()); + assert!( + !out.join("gone.txt").exists(), + "mirror must remove deleted files" + ); +} + +#[test] +fn extract_point_in_time() { + let tmp = TempDir::new(); + let src = tmp.path().join("src"); + let archive = tmp.path().join("a.pfs"); + let out_old = tmp.path().join("old"); + let out_new = tmp.path().join("new"); + + write(&src.join("a.txt"), b"original\n"); + create_archive(&archive, &src, &SyncOptions::default()).unwrap(); + let seq_after_create = head_seq(&archive); + + write(&src.join("a.txt"), b"updated\n"); + update_archive(&archive, &src, &SyncOptions::default()).unwrap(); + + extract_archive(&archive, &out_old, 
Some(seq_after_create), true).unwrap(); + extract_archive(&archive, &out_new, None, true).unwrap(); + assert_eq!(fs::read(out_old.join("a.txt")).unwrap(), b"original\n"); + assert_eq!(fs::read(out_new.join("a.txt")).unwrap(), b"updated\n"); +} + +#[test] +fn no_op_update_commits_no_session() { + let tmp = TempDir::new(); + let src = tmp.path().join("src"); + let archive = tmp.path().join("a.pfs"); + write(&src.join("a.txt"), b"same\n"); + create_archive(&archive, &src, &SyncOptions::default()).unwrap(); + let before = head_seq(&archive); + // Re-running update with no changes must not add a session. + update_archive(&archive, &src, &SyncOptions::default()).unwrap(); + assert_eq!(head_seq(&archive), before); +} + +#[cfg(unix)] +#[test] +fn metadata_mode_is_preserved_and_skippable() { + use std::os::unix::fs::PermissionsExt; + let tmp = TempDir::new(); + let src = tmp.path().join("src"); + let archive = tmp.path().join("a.pfs"); + let out = tmp.path().join("out"); + let out_no = tmp.path().join("out_no"); + + write(&src.join("secret.txt"), b"x\n"); + fs::set_permissions(src.join("secret.txt"), fs::Permissions::from_mode(0o640)).unwrap(); + + create_archive(&archive, &src, &SyncOptions::default()).unwrap(); + extract_archive(&archive, &out, None, true).unwrap(); + let mode = fs::metadata(out.join("secret.txt")) + .unwrap() + .permissions() + .mode() + & 0o777; + assert_eq!(mode, 0o640); + + // With metadata restore disabled the bits are not forced. + extract_archive(&archive, &out_no, None, false).unwrap(); + assert_eq!(fs::read(out_no.join("secret.txt")).unwrap(), b"x\n"); +} diff --git a/reference/PFS-MS-v1.0/tests/roundtrip.rs b/reference/PFS-MS-v1.0/tests/roundtrip.rs new file mode 100644 index 0000000..d36f698 --- /dev/null +++ b/reference/PFS-MS-v1.0/tests/roundtrip.rs @@ -0,0 +1,225 @@ +//! End-to-end tests for the `pfs-ms` reference crate. 
+ +use std::io::Cursor; + +use pcf::{Container, HashAlgo}; +use pfs_ms::{FsReader, FsWriter}; + +/// Build the Section 17 three-session scenario in memory and return the bytes. +fn build_spec_scenario() -> Vec { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + // Session 2: create docs/ and hello.txt v1. + w.mkdir("docs").unwrap(); + w.put_file("docs/hello.txt", b"Hello\n").unwrap(); + // Session: modify hello.txt to v2 (DELTA territory). + w.put_file("docs/hello.txt", b"Hello, world\n").unwrap(); + // Final: rename docs -> documents, delete hello.txt. + w.mv("docs", "documents").unwrap(); + w.rm("documents/hello.txt").unwrap(); + w.into_storage().into_inner() +} + +#[test] +fn spec_scenario_reconstructs_at_head() { + let bytes = build_spec_scenario(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + + let tree = r.tree().unwrap(); + // The root has exactly one live child: the renamed directory. + let root_kids: Vec = { + let root = pfs_ms::ROOT_NODE_ID; + tree.children[&root] + .iter() + .map(|id| tree.nodes[id].name_str()) + .collect() + }; + assert_eq!(root_kids, vec!["documents".to_string()]); + + // hello.txt is not live at the head. + assert!(r.read_path("documents/hello.txt").is_err()); +} + +#[test] +fn history_query_as_of_earlier_session() { + let bytes = build_spec_scenario(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + + // Find the session_seq at which hello.txt held "Hello, world\n". + // Sessions: 1 mkfs root, 2 mkdir docs, 3 put v1, 4 put v2, 5 mv, 6 rm. 
+ let v2 = r.read_path_as_of("docs/hello.txt", Some(4)).unwrap(); + assert_eq!(v2, b"Hello, world\n"); + + let v1 = r.read_path_as_of("docs/hello.txt", Some(3)).unwrap(); + assert_eq!(v1, b"Hello\n"); +} + +#[test] +fn delta_and_direct_reconstruct_correctly() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let v1 = b"the quick brown fox jumps over the lazy dog\n".repeat(20); + let v2 = { + let mut s = v1.clone(); + s.extend_from_slice(b"...with a small appended change\n"); + s + }; + w.put_file("f.txt", &v1).unwrap(); + w.put_file("f.txt", &v2).unwrap(); // should pick DELTA (small patch) + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("f.txt").unwrap(), v2); +} + +#[test] +fn large_compressible_file_roundtrips() { + // A large, highly compressible payload exercises the DEFLATE content path + // end to end (write -> store compressed -> read -> decompress). + let payload = b"0123456789abcdef".repeat(4096); // 64 KiB, very compressible + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("data.bin", &payload).unwrap(); + // Modify it; the new version is also compressed (DIRECT or DELTA). 
+ let mut payload2 = payload.clone(); + payload2.extend_from_slice(b"tail"); + w.put_file("data.bin", &payload2).unwrap(); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("data.bin").unwrap(), payload2); +} + +#[test] +fn move_file_preserves_bytes_via_inherit() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.mkdir("a").unwrap(); + w.mkdir("b").unwrap(); + w.put_file("a/note.txt", b"keep me\n").unwrap(); + w.mv("a/note.txt", "b/renamed.txt").unwrap(); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("b/renamed.txt").unwrap(), b"keep me\n"); + assert!(r.read_path("a/note.txt").is_err()); +} + +#[test] +fn directory_delete_is_recursive_by_ancestry() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.mkdir("d").unwrap(); + w.put_file("d/x.txt", b"x\n").unwrap(); + w.put_file("d/y.txt", b"y\n").unwrap(); + w.rm("d").unwrap(); // single tombstone removes the whole subtree + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + let tree = r.tree().unwrap(); + assert!(tree.children[&pfs_ms::ROOT_NODE_ID].is_empty()); + assert!(r.read_path("d/x.txt").is_err()); +} + +#[test] +fn resurrection_reuses_node_id_path() { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("z.txt", b"first\n").unwrap(); + w.rm("z.txt").unwrap(); + w.put_file("z.txt", b"second\n").unwrap(); // fresh node at same path + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("z.txt").unwrap(), b"second\n"); +} + +#[test] +fn reopen_and_append_more_sessions() { + let bytes = { + let mut w = 
FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("a.txt", b"alpha\n").unwrap(); + w.into_storage().into_inner() + }; + // Reopen and append. + let bytes = { + let mut w = FsWriter::open(Cursor::new(bytes)).unwrap(); + w.put_file("b.txt", b"beta\n").unwrap(); + w.into_storage().into_inner() + }; + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("a.txt").unwrap(), b"alpha\n"); + assert_eq!(r.read_path("b.txt").unwrap(), b"beta\n"); +} + +#[test] +fn multi_block_session_spans_overflow_blocks() { + // A single session that introduces > 255 partitions must use several + // Table Blocks (Section 6.1). Build it via the low-level commit API. + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + + let root = pfs_ms::ROOT_NODE_ID; + let mut parts = Vec::new(); + for i in 0..600u32 { + let rec = pfs_ms::NodeRecord { + kind: pfs_ms::KIND_FILE, + flags: 0, + node_id: pfs_ms::new_id(), + parent_id: root, + mtime_unix_ms: 0, + mode: 0, + name: format!("file{i:04}.txt").into_bytes(), + content: Some(pfs_ms::ContentSection::Empty), + }; + parts.push(pfs_ms::Partition::node(pfs_ms::new_id(), &rec)); + } + w.commit(parts, pfs_ms::new_id(), 600, 0, b"bulk").unwrap(); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); + let tree = r.tree().unwrap(); + assert_eq!(tree.children[&root].len(), 600); + + // The bulk session must report block_count >= 3 (1 HEAD + >= 2 members). + let sessions = r.list_sessions().unwrap(); + let bulk = sessions.iter().find(|s| s.writer == b"bulk").unwrap(); + assert!(bulk.block_count >= 3, "block_count = {}", bulk.block_count); +} + +#[test] +fn a_pfs_file_is_a_valid_pcf_file() { + // A generic PCF reader must enumerate every partition across all sessions + // as a flat, valid set, and verify every table_hash / data_hash. 
+ let bytes = build_spec_scenario(); + let mut c = Container::open(Cursor::new(bytes)).unwrap(); + c.verify().unwrap(); + let entries = c.entries().unwrap(); + // At least: 6 sessions worth of PFS_SESSION + the node/content partitions. + let sessions = entries + .iter() + .filter(|e| e.partition_type == pfs_ms::PFS_SESSION_TYPE) + .count(); + assert_eq!(sessions, 6); + assert!(entries + .iter() + .any(|e| e.partition_type == pfs_ms::PFS_NODE_TYPE)); +} + +#[test] +fn crash_recovery_truncated_tail_is_invisible() { + // Bytes written by an interrupted session (after the committed head) are + // invisible to readers because the header still points at the old head. + let committed = { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("a.txt", b"committed\n").unwrap(); + w.into_storage().into_inner() + }; + let mut with_garbage = committed.clone(); + with_garbage.extend_from_slice(&[0xABu8; 500]); // simulate an aborted append + + let mut r = FsReader::open(Cursor::new(with_garbage)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_path("a.txt").unwrap(), b"committed\n"); +} diff --git a/reference/PFS-MS-v1.0/tests/spec_compliance.rs b/reference/PFS-MS-v1.0/tests/spec_compliance.rs new file mode 100644 index 0000000..100df55 --- /dev/null +++ b/reference/PFS-MS-v1.0/tests/spec_compliance.rs @@ -0,0 +1,279 @@ +//! One test (or a small group) per normative requirement in the PFS-MS +//! specification's conformance section (Section 13, R1..R8 / W1..W7) plus the +//! field-layout constants of Appendix A. 
+ +use std::io::Cursor; + +use pcf::HashAlgo; +use pfs_ms::{ + build_reference_vector, ContentSection, Error, FsReader, FsWriter, NodeRecord, Partition, + KIND_DIR, KIND_FILE, NODE_PREFIX_LEN, ROOT_NODE_ID, +}; + +fn id(b: u8) -> [u8; 16] { + [b; 16] +} + +// ---- Appendix A: field layout constants --------------------------------- + +#[test] +fn appendix_a_layout_constants() { + assert_eq!(NODE_PREFIX_LEN, 54); + assert_eq!(pfs_ms::DIRECT_SECTION_LEN, 91); // includes compression_algo_id + assert_eq!(pfs_ms::DELTA_SECTION_LEN, 165); // includes compression_algo_id + assert_eq!(pfs_ms::SESSION_PREFIX_LEN, 162); + assert_eq!(pfs_ms::PFS_NODE_TYPE, 0xAAAA_0001); + assert_eq!(pfs_ms::PFS_SESSION_TYPE, 0xAAAA_0002); + assert_eq!(pfs_ms::RAW_TYPE, 0xFFFF_FFFF); + assert_eq!(pfs_ms::NODE_MAGIC, *b"PFSN"); + assert_eq!(pfs_ms::SESSION_MAGIC, *b"PFSS"); + assert_eq!(pfs_ms::ROOT_NODE_ID, [0u8; 16]); + assert_eq!(pfs_ms::COMPRESS_NONE, 0); + assert_eq!(pfs_ms::COMPRESS_DEFLATE, 1); +} + +// ---- R1: a conforming PFS reader is a conforming PCF reader -------------- + +#[test] +fn r1_rejects_non_pcf_input() { + let garbage = vec![0u8; 64]; + assert!(FsReader::open(Cursor::new(garbage)).is_err()); +} + +// ---- R2/R3: backward chain, strictly decreasing seq, one PFS_SESSION ----- + +#[test] +fn r2_r3_chain_is_backward_and_strictly_decreasing() { + let bytes = build_reference_vector().unwrap(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + let scan = r.scan().unwrap(); + // Sessions are newest-first and strictly decreasing. + let seqs: Vec = scan.sessions.iter().map(|s| s.seq).collect(); + assert_eq!(seqs, vec![3, 2, 1]); + // Each HEAD block holds exactly one PFS_SESSION entry. + for s in &scan.sessions { + // block_count is honoured (single-block sessions here). 
+ assert_eq!(s.record.block_count, 1); + } +} + +#[test] +fn r3_head_block_offsets_decrease_toward_the_tail() { + // The backward link means each newer session's HEAD sits at a higher + // offset than the previous session's HEAD. + let bytes = build_reference_vector().unwrap(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + let scan = r.scan().unwrap(); + let offs: Vec = scan.sessions.iter().map(|s| s.head_offset).collect(); + assert!(offs.windows(2).all(|w| w[0] > w[1]), "offsets {offs:?}"); +} + +// ---- R4: malformed node records are rejected ---------------------------- + +#[test] +fn r4_malformed_node_is_rejected_on_read() { + // A node with a reserved flag bit must fail when the reader parses it. + let mut w = FsWriter::create(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let bad = NodeRecord { + kind: KIND_DIR, + flags: 0x0002, // reserved bit + node_id: id(1), + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: b"x".to_vec(), + content: None, + }; + // Build the record bytes directly so the reserved bit survives to disk. 
+ w.commit(vec![Partition::node(id(0x01), &bad)], id(0x31), 1, 0, b"") + .unwrap(); + let bytes = w.into_storage().into_inner(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + assert!(matches!(r.scan(), Err(Error::MalformedNode(_)))); +} + +// ---- R5: newest wins; a node twice in one session is malformed ---------- + +#[test] +fn r5_duplicate_node_in_one_session_is_rejected() { + let mut w = FsWriter::create(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let dup = |name: &[u8]| NodeRecord { + kind: KIND_DIR, + flags: 0, + node_id: id(7), + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: name.to_vec(), + content: None, + }; + w.commit( + vec![ + Partition::node(id(0x01), &dup(b"a")), + Partition::node(id(0x02), &dup(b"b")), + ], + id(0x31), + 2, + 0, + b"", + ) + .unwrap(); + let bytes = w.into_storage().into_inner(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + assert!(matches!(r.scan(), Err(Error::DuplicateNodeInSession))); +} + +// ---- R6: liveness walk rejects cycles ----------------------------------- + +#[test] +fn r6_parent_cycle_is_rejected() { + let mut w = FsWriter::create(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let a = id(0xA0); + let b = id(0xB0); + let dir = |node_id, parent_id, name: &[u8]| NodeRecord { + kind: KIND_DIR, + flags: 0, + node_id, + parent_id, + mtime_unix_ms: 0, + mode: 0, + name: name.to_vec(), + content: None, + }; + // A's parent is B and B's parent is A: an unreachable cycle. 
+ w.commit( + vec![ + Partition::node(id(0x01), &dir(a, b, b"A")), + Partition::node(id(0x02), &dir(b, a, b"B")), + ], + id(0x31), + 2, + 0, + b"", + ) + .unwrap(); + let bytes = w.into_storage().into_inner(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + assert!(matches!(r.verify(), Err(Error::ParentCycle))); +} + +// ---- R7: content hashes are verified ------------------------------------ + +#[test] +fn r7_full_hash_mismatch_is_detected() { + // Forge a DIRECT record whose full_hash does not match its content. + let mut w = FsWriter::create(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let rec = NodeRecord { + kind: KIND_FILE, + flags: 0, + node_id: id(5), + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: b"f".to_vec(), + content: Some(ContentSection::Direct { + compression_algo: pfs_ms::COMPRESS_NONE, + content_uid: id(0xC0), + full_size: 5, + full_hash_algo: HashAlgo::Sha256, + full_hash: HashAlgo::Sha256.compute(b"WRONG"), // deliberately wrong + }), + }; + w.commit( + vec![ + Partition::raw(id(0xC0), "c", b"right".to_vec()), + Partition::node(id(0x01), &rec), + ], + id(0x31), + 1, + 0, + b"", + ) + .unwrap(); + let bytes = w.into_storage().into_inner(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + assert!(matches!(r.read_path("f"), Err(Error::ContentHashMismatch))); +} + +// ---- R8: the inter-session hash chain verifies on a good file ----------- + +#[test] +fn r8_inter_session_chain_verifies() { + let bytes = build_reference_vector().unwrap(); + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + // verify() includes verify_chain(); it must succeed on a well-formed file. 
+ r.verify().unwrap(); +} + +// ---- W2: only the 8-byte header pointer changes across a commit ---------- + +#[test] +fn w2_commit_only_rewrites_the_header_pointer() { + let f1 = { + let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + w.put_file("a.txt", b"alpha\n").unwrap(); + w.into_storage().into_inner() + }; + let len1 = f1.len(); + let f2 = { + let mut w = FsWriter::open(Cursor::new(f1.clone())).unwrap(); + w.put_file("b.txt", b"beta\n").unwrap(); + w.into_storage().into_inner() + }; + // The new session only appends; the previous bytes are immutable except + // for the 8-byte partition_table_offset at header offset 12. + assert!(f2.len() > len1); + assert_eq!(&f2[0..12], &f1[0..12]); // magic + version unchanged + assert_eq!(&f2[20..len1], &f1[20..len1]); // all prior bytes immutable + assert_ne!(&f2[12..20], &f1[12..20]); // head pointer advanced +} + +// ---- W3: HEAD carries the session, MEMBER blocks do not ------------------ + +#[test] +fn w3_member_blocks_carry_no_session_record() { + // A >255-entry session uses overflow MEMBER blocks; only the HEAD block may + // hold the PFS_SESSION partition. 
+ let mut w = FsWriter::mkfs(Cursor::new(Vec::new()), HashAlgo::Sha256).unwrap(); + let mut parts = Vec::new(); + for i in 0..300u32 { + let rec = NodeRecord { + kind: KIND_FILE, + flags: 0, + node_id: pfs_ms::new_id(), + parent_id: ROOT_NODE_ID, + mtime_unix_ms: 0, + mode: 0, + name: format!("f{i}").into_bytes(), + content: Some(ContentSection::Empty), + }; + parts.push(Partition::node(pfs_ms::new_id(), &rec)); + } + w.commit(parts, pfs_ms::new_id(), 300, 0, b"bulk").unwrap(); + let bytes = w.into_storage().into_inner(); + + let mut r = FsReader::open(Cursor::new(bytes)).unwrap(); + r.verify().unwrap(); // scan enforces "MEMBER block contains no PFS_SESSION" + let scan = r.scan().unwrap(); + let bulk = scan + .sessions + .iter() + .find(|s| s.record.writer == b"bulk") + .unwrap(); + assert!(bulk.record.block_count >= 2); + assert_eq!(bulk.nodes.len(), 300); +} + +// ---- byte-exact reference vector (Section 17) ---------------------------- + +#[test] +fn reference_vector_is_byte_exact() { + let bytes = build_reference_vector().unwrap(); + assert_eq!(bytes.len(), 2986, "reference vector length changed"); + let digest = HashAlgo::Sha256.compute(&bytes); + let hex: String = digest[..32].iter().map(|b| format!("{b:02x}")).collect(); + assert_eq!( + hex, "79b6dd7093172b4fe33d57a5ca53994c387dd3149021ef4fcb2b8a3fea7429bc", + "reference vector bytes changed" + ); +} diff --git a/specs/PFS-MS-spec-v1.0.txt b/specs/PFS-MS-spec-v1.0.txt index fbcc673..d788fca 100644 --- a/specs/PFS-MS-spec-v1.0.txt +++ b/specs/PFS-MS-spec-v1.0.txt @@ -19,6 +19,12 @@ Status of This Document The profile version described here is major version 1, minor version 0. + This pre-publication revision adds OPTIONAL per-file content compression + (Section 9.5): the DIRECT and DELTA content sections each carry a + compression_algo_id and are one byte longer than in earlier drafts. 
The + change is intentionally NOT compatible with files written by earlier drafts; + as the profile is unpublished, the version stays 1.0. + ------------------------------------------------------------------------------- Table of Contents @@ -51,6 +57,7 @@ Table of Contents 9.2 Delta Encoding and the Delta Algorithm Registry 9.3 Reconstruction 9.4 Delta Depth and Re-Baselining + 9.5 Compression and the Compression Algorithm Registry 10. Filesystem Semantics 10.1 Node Identity and the Root 10.2 Liveness @@ -84,7 +91,8 @@ Table of Contents - File CONTENT lives in PCF RAW partitions (type 0xFFFFFFFF). A RAW partition holds either the full bytes of a file or a binary patch - (delta) against a previous version. + (delta) against a previous version, in either case OPTIONALLY + compressed (Section 9.5). - NODE metadata lives in PFS_NODE partitions (type 0xAAAA0001). One PFS_NODE partition describes the complete current state of exactly @@ -450,34 +458,38 @@ Table of Contents Used for renames/moves that do not change bytes. 1 DIRECT - The full content lives in one RAW partition. + The full content lives in one RAW partition, optionally compressed + (Section 9.5). - +1 16 bytes content_uid (PCF uid of the RAW partition) - +17 8 u64 full_size (length of the content) - +25 1 u8 full_hash_algo_id (PCF hash registry id) - +26 64 bytes full_hash (hash of the full content) - section length = 90 bytes + +1 1 u8 compression_algo_id (Section 9.5) + +2 16 bytes content_uid (PCF uid of the RAW partition) + +18 8 u64 full_size (length of the DECOMPRESSED content) + +26 1 u8 full_hash_algo_id (PCF hash registry id) + +27 64 bytes full_hash (hash of the full content) + section length = 91 bytes 2 DELTA The content is the result of applying a patch to the previous - content-bearing version of this node. + content-bearing version of this node. The patch RAW bytes are + optionally compressed (Section 9.5). 
+1 1 u8 patch_algo_id (Section 9.2) - +2 16 bytes patch_uid (PCF uid of the patch RAW part.) - +18 8 u64 full_size (length of reconstructed content) - +26 1 u8 full_hash_algo_id - +27 64 bytes full_hash (hash of reconstructed content) - +91 8 u64 base_full_size (length of the base) - +99 1 u8 base_full_hash_algo_id - +100 64 bytes base_full_hash (hash of the base; see 9.3) - section length = 164 bytes + +2 1 u8 compression_algo_id (Section 9.5; applies to patch) + +3 16 bytes patch_uid (PCF uid of the patch RAW part.) + +19 8 u64 full_size (length of reconstructed content) + +27 1 u8 full_hash_algo_id + +28 64 bytes full_hash (hash of reconstructed content) + +92 8 u64 base_full_size (length of the base) + +100 1 u8 base_full_hash_algo_id + +101 64 bytes base_full_hash (hash of the base; see 9.3) + section length = 165 bytes full_hash and base_full_hash use the PCF Hash Algorithm Registry (PCF Section 8.1) and the PCF hash field encoding (left-aligned, zero-padded to the fixed width; here the field is 64 bytes, PCF Section 8.2). full_hash - commits to the FULL reconstructed content, not to the stored RAW bytes; - the stored RAW bytes are independently protected by the partition's PCF - data_hash. + commits to the FULL reconstructed content (after decompression), not to the + stored RAW bytes; the stored RAW bytes (compressed, if any) are + independently protected by the partition's PCF data_hash. ------------------------------------------------------------------------------- @@ -597,6 +609,11 @@ Table of Contents version), or INHERIT (same bytes as the previous version). A directory has no content. A tombstoned node has no content. + For DIRECT and DELTA, the bytes physically stored in the RAW partition (the + full content, or the patch) MAY be COMPRESSED; the content section names the + compression algorithm in compression_algo_id (Section 9.5). EMPTY and + INHERIT store no RAW bytes and are never compressed. 
+ 9.2 Delta Encoding and the Delta Algorithm Registry A DELTA content section names a patch algorithm by patch_algo_id: @@ -624,14 +641,16 @@ Table of Contents record V[k].content_kind: EMPTY -> the empty byte string DIRECT -> read the RAW partition V[k].content_uid; verify its PCF - data_hash; the bytes are the content; verify that - hash(bytes) == V[k].full_hash and len == full_size + data_hash; let content = decompress(compression_algo_id, + stored bytes) (Section 9.5); verify that + hash(content) == V[k].full_hash and len == full_size INHERIT -> materialize(k+1) DELTA -> base = materialize(k+1) verify hash(base) == V[k].base_full_hash and len(base) == V[k].base_full_size read the RAW partition V[k].patch_uid; verify its PCF data_hash + patch = decompress(compression_algo_id, stored bytes) bytes = apply(patch_algo_id, base, patch) verify hash(bytes) == V[k].full_hash and len == full_size @@ -662,6 +681,41 @@ Table of Contents Reader MAY refuse to reconstruct, reporting the file as too deeply chained rather than malformed. +9.5 Compression and the Compression Algorithm Registry + + The bytes stored in a DIRECT content RAW partition (the full content) or a + DELTA patch RAW partition (the patch) MAY be compressed before storage. The + content section's compression_algo_id names the algorithm: + + ID Algorithm Notes + --- ---------- --------------------------------------------------- + 0 none Stored verbatim (REQUIRED). + 1 DEFLATE RFC 1951 (the default and REQUIRED compression algo). + 2 zstd Reserved for a future revision. + 3 brotli Reserved for a future revision. + --- ---------- --------------------------------------------------- + + A conforming PFS-MS implementation MUST support compression_algo_id = 0 + (none) and = 1 (DEFLATE, RFC 1951). Identifiers 2.. are reserved for future + non-conflicting additions; they do not change any byte layout. 
A Reader + encountering a compression_algo_id it does not implement MUST treat the + affected file as unreadable but MUST NOT treat the file or the container as + malformed on that basis alone (the same rule as for patch_algo_id, + Section 9.2). + + Layering of integrity is unchanged in spirit (Section 7.3): the partition's + PCF data_hash protects the STORED (possibly compressed) bytes, while + full_hash protects the reconstructed content (for DELTA, the result of + applying the decompressed patch). full_size is the length of the + reconstructed (decompressed) content; the stored length is the RAW + partition's used_bytes. + + Compression and delta compose: for DELTA the patch is first decompressed, + then applied to the base. A Writer SHOULD store the compressed form only + when it is smaller than the verbatim bytes; otherwise it SHOULD use + compression_algo_id = 0. Choosing DELTA vs DIRECT (Section 9.4) is decided + on the post-compression stored sizes the Writer intends to emit. + ------------------------------------------------------------------------------- 10. Filesystem Semantics @@ -1016,11 +1070,14 @@ Table of Contents implementation and pinned as a byte-exact vector, mirroring PCF Section 15). Session 1 (session_seq = 1): - RAW r1 : full bytes of "hello.txt" v1 ("Hello\n") + RAW r1 : bytes of "hello.txt" v1, stored either verbatim + (compression_algo_id=0) or DEFLATE-compressed + (compression_algo_id=1) when smaller PFS_NODE: root (kind=dir, node_id=0, name="") PFS_NODE: dir "docs" (kind=dir, parent=root) PFS_NODE: file "hello.txt" (kind=file, parent=docs, - content DIRECT -> r1, full_size=6) + content DIRECT -> r1, compression_algo_id as above, + full_size = decompressed length) PFS_SESSION: seq=1, prev_session_hash = zero, prev algo id = 0, block_count = 1, member_blocks_digest = zero (algo 0) HEAD block T1: 5 entries, next_table_offset = 0, table_hash = H1 @@ -1083,20 +1140,22 @@ Appendix A.
Field Layout Summary content_section +0 1 u8 content_kind (0=EMPTY, 1=DIRECT, 2=DELTA, 3=INHERIT) - DIRECT (kind=1), total 90 bytes: - +1 16 bytes content_uid - +17 8 u64 full_size - +25 1 u8 full_hash_algo_id - +26 64 bytes full_hash - DELTA (kind=2), total 164 bytes: - +1 1 u8 patch_algo_id (1 = VCDIFF) - +2 16 bytes patch_uid - +18 8 u64 full_size + DIRECT (kind=1), total 91 bytes: + +1 1 u8 compression_algo_id (0=none, 1=DEFLATE) + +2 16 bytes content_uid + +18 8 u64 full_size (decompressed length) +26 1 u8 full_hash_algo_id +27 64 bytes full_hash - +91 8 u64 base_full_size - +99 1 u8 base_full_hash_algo_id - +100 64 bytes base_full_hash + DELTA (kind=2), total 165 bytes: + +1 1 u8 patch_algo_id (1 = VCDIFF) + +2 1 u8 compression_algo_id (0=none, 1=DEFLATE; of patch) + +3 16 bytes patch_uid + +19 8 u64 full_size + +27 1 u8 full_hash_algo_id + +28 64 bytes full_hash + +92 8 u64 base_full_size + +100 1 u8 base_full_hash_algo_id + +101 64 bytes base_full_hash EMPTY (kind=0) / INHERIT (kind=3): no further bytes PFS_SESSION record (PCF type 0xAAAA0002) -- partition data, in HEAD block @@ -1144,6 +1203,8 @@ Appendix B. Type and Constant Registry Node flags bit 0 = TOMBSTONE (others reserved, MUST be 0) content_kind 0 = EMPTY, 1 = DIRECT, 2 = DELTA, 3 = INHERIT patch_algo_id 1 = VCDIFF (RFC 3284, required), 2 = xdelta (reserved) + compression_algo_id 0 = none (required), 1 = DEFLATE (RFC 1951, required), + 2 = zstd (reserved), 3 = brotli (reserved) Limits PFS_MAX_NAME 1024 bytes (UTF-8) diff --git a/tools/pcf-debug/src/plugin/pfs.rs b/tools/pcf-debug/src/plugin/pfs.rs index 08fb7e7..ba35846 100644 --- a/tools/pcf-debug/src/plugin/pfs.rs +++ b/tools/pcf-debug/src/plugin/pfs.rs @@ -56,6 +56,26 @@ fn hash_pair( node } +/// Render a `compression_algo_id` byte as a labelled enum field (Section 9.5). 
+fn compression_field(data: &[u8], off: usize) -> FieldNode { + let id = data.get(off).copied().unwrap_or(0); + let name = match id { + 0 => "none", + 1 => "DEFLATE", + 2 => "zstd", + 3 => "brotli", + _ => "reserved", + }; + FieldNode::leaf( + "compression_algo_id", + FieldValue::Enum { + raw: id as u64, + name: name.into(), + }, + (off as u64, off as u64 + 1), + ) +} + // --------------------------------------------------------------------------- // PFS_NODE // --------------------------------------------------------------------------- @@ -239,25 +259,26 @@ fn decode_content(data: &[u8], s: usize, warnings: &mut Vec) -> FieldNod match content_kind { 0 | 3 => {} // EMPTY / INHERIT: no further bytes. 1 => { - // DIRECT, 90 bytes total. - if let Some(uid) = uid_at(data, s + 1) { + // DIRECT, 91 bytes total (Section 7.3). + content.push(compression_field(data, s + 1)); + if let Some(uid) = uid_at(data, s + 2) { content.push(FieldNode::leaf( "content_uid", FieldValue::Uid(uid), - (s as u64 + 1, s as u64 + 17), + (s as u64 + 2, s as u64 + 18), )); } - let full_size = le_u64(data, s + 17).unwrap_or(0); + let full_size = le_u64(data, s + 18).unwrap_or(0); content.push(FieldNode::leaf( "full_size", FieldValue::U64(full_size), - (s as u64 + 17, s as u64 + 25), + (s as u64 + 18, s as u64 + 26), )); - content.push(hash_pair("full_hash", data, s + 25, s + 26, warnings)); - check_trailing(data, s + 90, warnings); + content.push(hash_pair("full_hash", data, s + 26, s + 27, warnings)); + check_trailing(data, s + 91, warnings); } 2 => { - // DELTA, 164 bytes total. + // DELTA, 165 bytes total (Section 7.3). 
let patch_algo = data.get(s + 1).copied().unwrap_or(0); let patch_name = if patch_algo == 1 { "VCDIFF" @@ -272,28 +293,35 @@ fn decode_content(data: &[u8], s: usize, warnings: &mut Vec) -> FieldNod }, (s as u64 + 1, s as u64 + 2), )); - if let Some(uid) = uid_at(data, s + 2) { + content.push(compression_field(data, s + 2)); + if let Some(uid) = uid_at(data, s + 3) { content.push(FieldNode::leaf( "patch_uid", FieldValue::Uid(uid), - (s as u64 + 2, s as u64 + 18), + (s as u64 + 3, s as u64 + 19), )); } - let full_size = le_u64(data, s + 18).unwrap_or(0); + let full_size = le_u64(data, s + 19).unwrap_or(0); content.push(FieldNode::leaf( "full_size", FieldValue::U64(full_size), - (s as u64 + 18, s as u64 + 26), + (s as u64 + 19, s as u64 + 27), )); - content.push(hash_pair("full_hash", data, s + 26, s + 27, warnings)); - let base_size = le_u64(data, s + 91).unwrap_or(0); + content.push(hash_pair("full_hash", data, s + 27, s + 28, warnings)); + let base_size = le_u64(data, s + 92).unwrap_or(0); content.push(FieldNode::leaf( "base_full_size", FieldValue::U64(base_size), - (s as u64 + 91, s as u64 + 99), + (s as u64 + 92, s as u64 + 100), + )); + content.push(hash_pair( + "base_full_hash", + data, + s + 100, + s + 101, + warnings, )); - content.push(hash_pair("base_full_hash", data, s + 99, s + 100, warnings)); - check_trailing(data, s + 164, warnings); + check_trailing(data, s + 165, warnings); } _ => {} } diff --git a/tools/pcf-debug/tests/common/mod.rs b/tools/pcf-debug/tests/common/mod.rs index 03bfdc3..0a2545c 100644 --- a/tools/pcf-debug/tests/common/mod.rs +++ b/tools/pcf-debug/tests/common/mod.rs @@ -58,8 +58,9 @@ pub fn pfs_node_direct(name: &str) -> Vec { r.extend_from_slice(&0o644u32.to_le_bytes()); // mode r.extend_from_slice(&(name.len() as u16).to_le_bytes()); // name_len r.extend_from_slice(name.as_bytes()); // name - // content section (DIRECT, 90 bytes) + // content section (DIRECT, 91 bytes) r.push(1); // content_kind = DIRECT + r.push(1); // 
compression_algo_id = DEFLATE r.extend_from_slice(&[0xEE; 16]); // content_uid r.extend_from_slice(&42u64.to_le_bytes()); // full_size r.push(16); // full_hash_algo_id = SHA-256 diff --git a/tools/pcf-debug/tests/decode_pfs.rs b/tools/pcf-debug/tests/decode_pfs.rs index 252ce3c..37e4c6f 100644 --- a/tools/pcf-debug/tests/decode_pfs.rs +++ b/tools/pcf-debug/tests/decode_pfs.rs @@ -69,6 +69,16 @@ fn node_direct_fields_and_ranges() { } ); + let comp = find(&d.fields, "compression_algo_id").unwrap(); + assert_eq!( + comp.value, + FieldValue::Enum { + raw: 1, + name: "DEFLATE".into() + } + ); + assert_eq!(comp.range, Some((54 + 9 + 1, 54 + 9 + 2))); + let full_size = find(&d.fields, "full_size").unwrap(); assert_eq!(full_size.value, FieldValue::U64(42)); }