simstudioai
diff --git a/‎apps/sim/app/workspace/[workspaceId]/files/components/file-viewer/rich-markdown-editor/markdown-parse.fuzz.test.ts‎
Lines changed: 93 additions & 0 deletions b/‎apps/sim/app/workspace/[workspaceId]/files/components/file-viewer/rich-markdown-editor/markdown-parse.fuzz.test.ts‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎apps/sim/app/workspace/[workspaceId]/files/components/file-viewer/rich-markdown-editor/markdown-parse.test.ts‎
Lines changed: 143 additions & 0 deletions b/‎apps/sim/app/workspace/[workspaceId]/files/components/file-viewer/rich-markdown-editor/markdown-parse.test.ts‎
Lines changed: 143 additions & 0 deletions
diff --git a/‎apps/sim/app/workspace/[workspaceId]/files/components/file-viewer/rich-markdown-editor/markdown-parse.ts‎
Lines changed: 133 additions & 0 deletions b/‎apps/sim/app/workspace/[workspaceId]/files/components/file-viewer/rich-markdown-editor/markdown-parse.ts‎
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,93 @@
+/**
+ * @vitest-environment jsdom
+ *
+ * Property test: for any document assembled from a palette of block constructs, the chunked parse
+ * must round-trip byte-identically to the whole-document parse, and be idempotent. This fuzzes the
+ * block splitter across thousands of randomized block combinations — the structures that a naive
+ * splitter shatters (loose lists, nested lists, multi-paragraph items, blockquotes) appear adjacent
+ * in every permutation — so a boundary bug surfaces here rather than on a user's file.
+ */
+import { Editor } from '@tiptap/core'
+import { afterEach, describe, expect, it } from 'vitest'
+import { createMarkdownContentExtensions } from './extensions'
+import { serializeMarkdownBody } from './markdown-parse'
+import { isRoundTripSafe } from './round-trip-safety'
+
+/**
+ * Editor owned by the `oneShot` call currently in flight. `oneShot` clears it on success, so the
+ * `afterEach` hook only finds an instance here when a call did not run to completion.
+ */
+let editor: Editor | null = null
+afterEach(() => {
+  if (editor) editor.destroy()
+  editor = null
+})
+
+/** Whole-document reference path: load `body` as markdown into a fresh editor and serialize it back. */
+function oneShot(body: string): string {
+  const instance = new Editor({ extensions: createMarkdownContentExtensions() })
+  editor = instance
+  instance.commands.setContent(body, { contentType: 'markdown' })
+  const markdown = instance.getMarkdown()
+  instance.destroy()
+  editor = null
+  return markdown
+}
+
+/**
+ * Seeded mulberry32 generator. Returns a function yielding floats in `[0, 1)`; the same seed always
+ * produces the same sequence, so a failing document can be rebuilt from its seed alone.
+ */
+function rng(seed: number): () => number {
+  // The state is kept as a signed 32-bit integer for the generator's whole lifetime.
+  let state = seed | 0
+  return function next(): number {
+    state = (state + 0x6d2b79f5) | 0
+    const mixed = Math.imul(state ^ (state >>> 15), state | 1)
+    const scrambled = (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61)) ^ mixed
+    const unsigned = (scrambled ^ (scrambled >>> 14)) >>> 0
+    return unsigned / 0x100000000
+  }
+}
+
+/**
+ * Palette of block generators the fuzzer draws from. Each entry receives the PRNG `r` and returns
+ * the markdown source of one block; only the inline-marks paragraph actually draws from `r` (to vary
+ * its link URL), every other entry returns fixed text.
+ */
+const BLOCKS: Array<(r: () => number) => string> = [
+  () => '# Heading one',
+  () => '### Heading three',
+  (r) =>
+    `A paragraph with **bold**, *italic*, \`code\`, and a [link](https://x.com/${Math.floor(r() * 99)}).`,
+  () => '- tight a\n- tight b\n- tight c',
+  () => '- loose a\n\n- loose b\n\n- loose c',
+  () => '1. ordered one\n2. ordered two\n3. ordered three',
+  () => '1. First\n   - sub bullet\n   - another\n     1. deep ordered\n     2. item\n2. Second',
+  () => '1. item\n\n   a second paragraph inside the item\n\n2. next item',
+  () => '> a blockquote\n> spanning lines\n>\n> > and a nested one',
+  () => '| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |',
+  () => '```ts\nconst x = 1\n\nfunction f() {\n  return x\n}\n```',
+  () => '- [ ] todo one\n- [x] done two\n  - [ ] subtask',
+  () => '---',
+  () => '![alt](https://img.example/a.png)',
+  () => '[![badge](https://img.shields.io/x.svg)](https://link.example)',
+  () => 'Text with ~~strikethrough~~ and a soft  \nline break inside it.',
+  // Raw HTML / reference defs: these route to the whole-document fallback, so fidelity must still
+  // hold even though the doc itself opens read-only (idempotency is only asserted for editable docs).
+  () => '<div class="note">\n\nwrapped content\n\n</div>',
+  () => 'See [the docs][ref].\n\n[ref]: https://example.com/docs',
+]
+
+/**
+ * Assemble the document for `seed`: between 2 and 9 blocks picked from `BLOCKS` with a PRNG seeded
+ * by `seed`, joined by one blank line. The same seed always yields the same document.
+ */
+function buildDoc(seed: number): string {
+  const next = rng(seed)
+  const blockCount = 2 + Math.floor(next() * 8)
+  return Array.from({ length: blockCount }, () => {
+    // Pick the generator first, then let it draw from the same stream (order matters for replay).
+    const generate = BLOCKS[Math.floor(next() * BLOCKS.length)]
+    return generate(next)
+  }).join('\n\n')
+}
+
+describe('markdown chunked-parse property test', () => {
+  it('chunked === one-shot for every randomized document, and idempotent for every editable one', () => {
+    const lastSeed = 400
+    const failures: Array<{ seed: number; kind: string }> = []
+    for (let seed = 1; seed <= lastSeed; seed++) {
+      const body = buildDoc(seed)
+      const chunked = serializeMarkdownBody(body)
+
+      // Fidelity is the invariant everything else rests on: for ANY input the chunked parse must
+      // serialize exactly like the whole-document parse. A seed that fails here is reported once
+      // and not checked further.
+      if (chunked !== oneShot(body)) {
+        failures.push({ seed, kind: 'fidelity' })
+        continue
+      }
+
+      // Idempotency (re-parsing the output changes nothing) is only required of editable docs.
+      // Read-only docs are never re-serialized through the editor, and some constructs (raw HTML)
+      // are non-idempotent in the underlying editor regardless of chunking — which is why they
+      // open read-only in the first place.
+      if (!isRoundTripSafe(body)) continue
+      if (serializeMarkdownBody(chunked) !== chunked) {
+        failures.push({ seed, kind: 'idempotency' })
+      }
+    }
+    expect(failures).toEqual([])
+  })
+})
@@ -0,0 +1,143 @@
+/**
+ * @vitest-environment jsdom
+ */
+import { Editor } from '@tiptap/core'
+import { afterEach, describe, expect, it } from 'vitest'
+import { createMarkdownContentExtensions } from './extensions'
+import { parseMarkdownToDoc, serializeMarkdownBody, splitMarkdownBlocks } from './markdown-parse'
+
+/**
+ * Editor owned by the `oneShot` call currently in flight. `oneShot` clears it on success, so the
+ * `afterEach` hook only finds an instance here when a call did not run to completion.
+ */
+let editor: Editor | null = null
+afterEach(() => {
+  if (editor) editor.destroy()
+  editor = null
+})
+
+/** The whole-document reference path: load the markdown in a single `setContent`, then serialize. */
+function oneShot(body: string): string {
+  const instance = new Editor({ extensions: createMarkdownContentExtensions() })
+  editor = instance
+  instance.commands.setContent(body, { contentType: 'markdown' })
+  const markdown = instance.getMarkdown()
+  instance.destroy()
+  editor = null
+  return markdown
+}
+
+/**
+ * Chunked parsing must be byte-identical to the one-shot path — these are the structures a naive
+ * blank-line split would shatter (loose lists span blank lines, list items hold multiple paragraphs,
+ * blockquotes and fenced code contain blank lines), so they're the real fidelity test.
+ *
+ * Each entry is a `[label, markdown body]` tuple: the label is interpolated into the test title by
+ * `it.each`, and the body is the input that gets parsed.
+ */
+const CASES: Array<[string, string]> = [
+  [
+    'heading + inline marks',
+    '# Heading\n\nA paragraph with **bold**, *italic*, `code`, and a [link](https://x.com).',
+  ],
+  ['tight list', '- tight a\n- tight b\n- tight c'],
+  ['loose list (blank lines between items)', '- loose a\n\n- loose b\n\n- loose c'],
+  ['multi-paragraph list item', '1. first\n\n   second paragraph in item one\n\n2. second item'],
+  ['nested list', '- outer\n  - nested one\n  - nested two\n    - deeper\n- outer two'],
+  [
+    'nested blockquote with blank lines',
+    '> a blockquote\n>\n> with two paragraphs\n>\n> > and a nested quote',
+  ],
+  ['gfm table', '| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |'],
+  [
+    'fenced code with internal blank line',
+    '```ts\nconst x = 1\n\nfunction f() {\n  return x\n}\n```',
+  ],
+  ['task list', '- [ ] task one\n- [x] task two done\n  - [ ] subtask'],
+  ['thematic break between paragraphs', 'Para before.\n\n---\n\nPara after a divider.'],
+  [
+    'image + linked-image badge',
+    '![alt](https://img.example/a.png)\n\n[![badge](https://img.shields.io/x.svg)](https://link.example)',
+  ],
+  // The editor serializes nested lists with sub-3-space indentation, which a strict external lexer
+  // would mis-nest — this is the exact case that must survive re-parsing (idempotency).
+  [
+    'reduced-indent nested list (editor output shape)',
+    '1. First\n  - sub bullet\n  - another\n  1. deep ordered\n  2. item\n2. Second',
+  ],
+  ['heading-separated sections', '# A\n\nalpha\n\n## B\n\nbeta\n\n## C\n\ngamma'],
+]
+
+describe('parseMarkdownToDoc (chunked)', () => {
+  it('produces a doc node', () => {
+    const doc = parseMarkdownToDoc('# Hi\n\nbody')
+    expect(doc.type).toBe('doc')
+    expect(Array.isArray(doc.content)).toBe(true)
+  })
+
+  it.each(CASES)('chunked parse round-trips identically to one-shot: %s', (_label, body) => {
+    expect(serializeMarkdownBody(body)).toBe(oneShot(body))
+  })
+
+  // The editor re-parses its own output on every settle/repeat-stream, so a second pass must not
+  // drift — otherwise editing + saving + reopening would slowly corrupt structure (this is the bug
+  // that an external lexer introduced for sub-3-space nested lists).
+  it.each(CASES)('is idempotent (a second pass changes nothing): %s', (_label, body) => {
+    const once = serializeMarkdownBody(body)
+    expect(serializeMarkdownBody(once)).toBe(once)
+  })
+
+  // Asserts the whole node, not just its type: input with no blocks takes the chunked path with
+  // zero blocks, so the assembled doc must carry an empty `content` array.
+  it('empty and whitespace-only input produce an empty doc', () => {
+    expect(parseMarkdownToDoc('')).toEqual({ type: 'doc', content: [] })
+    expect(parseMarkdownToDoc('   \n\n  ')).toEqual({ type: 'doc', content: [] })
+    expect(splitMarkdownBlocks('')).toEqual([])
+    expect(splitMarkdownBlocks('\n\n  \n')).toEqual([])
+  })
+
+  it('parses reference-style links whole (non-chunkable) without dropping the definition', () => {
+    const body = 'See [the docs][ref] for details.\n\n[ref]: https://example.com/docs'
+    expect(serializeMarkdownBody(body)).toBe(oneShot(body))
+  })
+
+  // Block-level HTML can wrap blank lines; it routes to the whole-document fallback so chunked output
+  // still matches one-shot exactly. (Such docs open read-only via the round-trip-safety probe, so
+  // they're never re-serialized — the editor itself isn't idempotent on raw HTML.)
+  it.each([
+    ['html block', '<div class="x">\n\ncontent\n\n</div>'],
+    ['html comment', 'before\n\n<!-- a note -->\n\nafter'],
+    ['html table', '<table>\n\n<tr><td>a</td></tr>\n\n</table>'],
+  ])(
+    'block HTML renders via the whole-document fallback, matching one-shot: %s',
+    (_label, body) => {
+      expect(serializeMarkdownBody(body)).toBe(oneShot(body))
+    }
+  )
+
+  describe('splitMarkdownBlocks keeps ambiguous structures atomic', () => {
+    it('a loose list (blank lines between items) stays one block', () => {
+      expect(splitMarkdownBlocks('- a\n\n- b\n\n- c')).toEqual(['- a\n\n- b\n\n- c'])
+    })
+    it('a nested list (no blank lines) stays one block', () => {
+      expect(splitMarkdownBlocks('1. First\n  - sub\n  - two\n2. Second')).toHaveLength(1)
+    })
+    it('independent paragraphs split into separate blocks', () => {
+      expect(splitMarkdownBlocks('para one\n\npara two\n\npara three')).toHaveLength(3)
+    })
+    it('headings and paragraphs split; fenced code with blank lines stays one block', () => {
+      expect(splitMarkdownBlocks('# H\n\ntext\n\n```\na\n\nb\n```')).toEqual([
+        '# H',
+        'text',
+        '```\na\n\nb\n```',
+      ])
+    })
+    it('a multi-paragraph list item (indented continuation) stays one block', () => {
+      expect(splitMarkdownBlocks('1. first\n\n   second para\n\n2. next')).toHaveLength(1)
+    })
+  })
+
+  it('matches one-shot on a large mixed document (the case the chunker exists for)', () => {
+    const blocks: string[] = ['# Big Doc']
+    for (let i = 0; i < 300; i++) {
+      blocks.push(
+        `## Section ${i}\n\nProse with **bold** and a [link](https://x.com/${i}) and \`code\`.`
+      )
+      if (i % 10 === 0) blocks.push('```ts\nconst x = ' + i + '\n```')
+      if (i % 9 === 0) blocks.push('- item a\n\n- item b\n\n- item c')
+      if (i % 7 === 0) blocks.push('| a | b |\n| --- | --- |\n| 1 | 2 |')
+    }
+    const body = blocks.join('\n\n')
+    expect(serializeMarkdownBody(body)).toBe(oneShot(body))
+  })
+})
@@ -0,0 +1,133 @@
+import { Editor, type JSONContent } from '@tiptap/core'
+import { createMarkdownContentExtensions } from './extensions'
+
+/**
+ * The one editor instance shared by every chunked parse/serialize call in this module. It is built
+ * on first use rather than at import time, so merely importing the module (for example during SSR)
+ * constructs nothing.
+ *
+ * Sharing is relied on for two reasons stated by the original author: `MarkdownManager.parse` is
+ * pure and re-entrant (it builds its own lexer and never reads the editor's document), and
+ * `serializeMarkdownBody`, which does overwrite the document through `setContent`, runs
+ * synchronously to completion before any other call can start. The instance is never destroyed
+ * here: one bounded editor per session instead of one allocation per call.
+ */
+let parser: Editor | null = null
+
+/** Return the shared editor, constructing it on the first call. */
+function parserEditor(): Editor {
+  parser ??= new Editor({ extensions: createMarkdownContentExtensions() })
+  return parser
+}
+
+/**
+ * Return the object the shared editor exposes as `.markdown`.
+ *
+ * @throws Error when that property is missing, i.e. the markdown extension was not installed.
+ */
+function markdownManager() {
+  const { markdown } = parserEditor()
+  if (markdown) return markdown
+  throw new Error('Markdown extension is not installed on the parser editor')
+}
+
+/**
+ * Constructs whose meaning spans blank-line boundaries, so the document can't be split into blocks
+ * without changing how they parse — these documents parse whole (correct, if slower; they're
+ * uncommon and almost always round-trip-unsafe and read-only anyway):
+ * - A link/image *reference definition* (`[id]: url`) or footnote definition can sit far from its
+ *   `[text][id]` / `[^id]` use; splitting them apart would drop the reference. The editor never
+ *   *emits* reference-style links, so this only matters on the first open of such a file.
+ *   Whitespace after the colon is optional in CommonMark (`[id]:url` is a valid definition), so the
+ *   match stops at `]:`. Over-matching only costs a whole-document parse; under-matching would
+ *   silently drop the link.
+ * - A block-level HTML element (`<div>…</div>`, `<table>…`) or HTML comment can wrap blank lines; the
+ *   splitter would shatter it (matched here by a line that opens an HTML tag/comment, not inline
+ *   `<https://…>` autolinks).
+ *
+ * The pattern is multiline: it fires if ANY line of the body starts (after up to three spaces) with
+ * one of these constructs.
+ */
+const NON_CHUNKABLE =
+  /^[ ]{0,3}(?:\[(?:\^[^\]]+|[^\]^][^\]]*)\]:|<(?:!--|\/?[a-zA-Z][a-zA-Z0-9-]*[\s/>]))/m
+
+const FENCE_OPEN = /^ {0,3}(`{3,}|~{3,})/
+// A trailing `\r` is tolerated so a closing fence is still recognised in CRLF documents: the body
+// is split on `\n` only, which leaves the `\r` attached to every line.
+const FENCE_CLOSE = /^ {0,3}(`{3,}|~{3,})[ \t\r]*$/
+const LIST_MARKER = /^[ ]{0,3}(?:[-*+]|\d+[.)])\s/
+const BLOCKQUOTE = /^[ ]{0,3}>/
+
+/** Inclusive range of line indices into the split body, plus the text of the range's first line. */
+interface LineSpan {
+  start: number
+  end: number
+  firstLine: string
+}
+
+/**
+ * Split a markdown body into top-level blocks that can each be parsed independently and reassembled
+ * without changing meaning. Blank lines separate candidate groups (fenced code blocks stay atomic),
+ * then adjacent groups are merged back together whenever they could form one logical block: any
+ * indented (continuation) group, and consecutive list/blockquote groups (which would otherwise be a
+ * single loose list/quote). Merging is intentionally conservative — over-merging only yields a larger
+ * chunk, whereas under-merging would shatter a structure — and every block is parsed by
+ * `@tiptap/markdown`'s own lexer, so block boundaries always match the parser.
+ *
+ * Every returned block is a verbatim slice of `body`: the lines separating merged groups are kept
+ * exactly as written rather than normalised to a single blank line. That matters wherever blank
+ * lines are content — an indented code block, or a fence indented too deep for `FENCE_OPEN` inside
+ * a nested list item — because collapsing them would make the chunked parse differ from the
+ * whole-document parse.
+ *
+ * @param body Markdown source; lines are split on `\n`.
+ * @returns The blocks in document order; empty when `body` contains only blank lines.
+ */
+export function splitMarkdownBlocks(body: string): string[] {
+  const lines = body.split('\n')
+
+  // Pass 1: cut the body into blank-line-separated groups, treating a fenced code block as opaque.
+  const groups: LineSpan[] = []
+  let active: LineSpan | null = null
+  let fence: string | null = null
+  for (const [index, line] of lines.entries()) {
+    if (fence !== null) {
+      // Inside a fence everything is content; only a closer using the same character and at least
+      // as long as the opener ends it.
+      const marker = FENCE_CLOSE.exec(line)?.[1]
+      if (marker !== undefined && marker[0] === fence[0] && marker.length >= fence.length) {
+        fence = null
+      }
+    } else {
+      const marker = FENCE_OPEN.exec(line)?.[1]
+      if (marker !== undefined) {
+        fence = marker
+      } else if (line.trim() === '') {
+        if (active !== null) groups.push(active)
+        active = null
+        continue
+      }
+    }
+    if (active === null) active = { start: index, end: index, firstLine: line }
+    else active.end = index
+  }
+  if (active !== null) groups.push(active)
+
+  // The marker regexes are anchored at the start and can consume at most the first line plus the
+  // line break that follows it, so testing this short prefix is equivalent to testing the span's
+  // full text without re-materialising an ever-growing merged block.
+  const head = (span: LineSpan): string =>
+    span.end > span.start ? `${span.firstLine}\n` : span.firstLine
+  const startsIndented = /^\s/
+
+  // Pass 2: fold a group into the previous block when it may belong to the same structure.
+  const blocks: LineSpan[] = []
+  for (const group of groups) {
+    const prev = blocks.length > 0 ? blocks[blocks.length - 1] : undefined
+    const continues =
+      prev !== undefined &&
+      (startsIndented.test(group.firstLine) ||
+        (LIST_MARKER.test(head(prev)) && LIST_MARKER.test(head(group))) ||
+        (BLOCKQUOTE.test(head(prev)) && BLOCKQUOTE.test(head(group))))
+    // Extending the span, rather than concatenating text, is what preserves the separator lines.
+    if (prev !== undefined && continues) prev.end = group.end
+    else blocks.push(group)
+  }
+  return blocks.map((span) => lines.slice(span.start, span.end + 1).join('\n'))
+}
+
+/**
+ * Build a ProseMirror doc from a markdown body by parsing its top-level blocks one at a time and
+ * concatenating the resulting nodes.
+ *
+ * Why not parse in one go: per the original author's measurements, `@tiptap/markdown`'s
+ * `setContent(md, 'markdown')` is superlinear (~O(n²)) in document size and freezes the main thread
+ * at mount for large files, whereas block-by-block parsing is linear (~22ms vs ~1270ms at 61KB) and
+ * yields the same result because every block goes through the same tokenizers.
+ *
+ * Two escape hatches keep correctness independent of the splitter: bodies matching
+ * {@link NON_CHUNKABLE} (constructs that span blocks) are parsed whole up front, and if anything
+ * throws while splitting or parsing a block, the body is re-parsed whole instead.
+ *
+ * @param body Markdown source.
+ * @returns A `doc` node; on the chunked path its `content` is the blocks' nodes in document order.
+ */
+export function parseMarkdownToDoc(body: string): JSONContent {
+  const manager = markdownManager()
+  const parseWhole = (): JSONContent => manager.parse(body)
+  if (NON_CHUNKABLE.test(body)) return parseWhole()
+  try {
+    // Each per-block parse yields a doc node; lift its children into the combined doc.
+    const content = splitMarkdownBlocks(body).flatMap(
+      (block): JSONContent[] => manager.parse(block).content ?? []
+    )
+    return { type: 'doc', content }
+  } catch {
+    return parseWhole()
+  }
+}
+
+/**
+ * Round-trip a markdown body: chunked parse, load into the shared editor, serialize back out.
+ *
+ * The parsed doc is pushed through `setContent` instead of being serialized directly so that it
+ * undergoes the same schema normalization as a document in the live editor, which keeps the result
+ * identical to `editor.getMarkdown()`. This overwrites the shared editor's document.
+ *
+ * @param body Markdown source.
+ * @returns The markdown the editor emits for the parsed document.
+ */
+export function serializeMarkdownBody(body: string): string {
+  const scratch = parserEditor()
+  const doc = parseMarkdownToDoc(body)
+  scratch.commands.setContent(doc, { contentType: 'json' })
+  return scratch.getMarkdown()
+}