From 0675b802efd0e885df4ce4de3a62e4699742c359 Mon Sep 17 00:00:00 2001 From: Jordan Coin Jackson Date: Sat, 21 Feb 2026 23:56:17 -0500 Subject: [PATCH 1/2] Add URL support and YAML export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - URL-to-structure pipeline: Chrome headless → PDF → go-pdfium font analysis → section tree - pdftotext integration for clean text extraction with go-pdfium fallback - `-o` flag to export any document as structured YAML - Heading detection via font size histogram (adapts to each document) - Chrome auto-detection on macOS/Linux/Windows with CHROME_PATH override Co-Authored-By: Claude Opus 4.6 --- go.mod | 15 +- go.sum | 49 +++ main.go | 79 ++++- parser/export.go | 57 ++++ parser/url.go | 787 +++++++++++++++++++++++++++++++++++++++++++++ parser/url_test.go | 355 ++++++++++++++++++++ 6 files changed, 1337 insertions(+), 5 deletions(-) create mode 100644 parser/export.go create mode 100644 parser/url.go create mode 100644 parser/url_test.go diff --git a/go.mod b/go.mod index e1e58f4..c0d8300 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,17 @@ module github.com/JordanCoin/docmap go 1.25.5 -require github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 +require ( + github.com/klippa-app/go-pdfium v1.17.3 + github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 + gopkg.in/yaml.v3 v3.0.1 +) -require gopkg.in/yaml.v3 v3.0.1 // indirect +require ( + github.com/google/uuid v1.6.0 // indirect + github.com/jolestar/go-commons-pool/v2 v2.1.2 // indirect + github.com/tetratelabs/wazero v1.11.0 // indirect + golang.org/x/net v0.50.0 // indirect + golang.org/x/sys v0.41.0 // indirect + golang.org/x/text v0.34.0 // indirect +) diff --git a/go.sum b/go.sum index 24ddac1..7201de6 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,54 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod 
h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= +github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/vP9vJGqPwcdqsWjOt+V8J7+bTc= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/jolestar/go-commons-pool/v2 v2.1.2 h1:E+XGo58F23t7HtZiC/W6jzO2Ux2IccSH/yx4nD+J1CM= +github.com/jolestar/go-commons-pool/v2 v2.1.2/go.mod h1:r4NYccrkS5UqP1YQI1COyTZ9UjPJAAGTUxzcsK1kqhY= +github.com/klippa-app/go-pdfium v1.17.3 h1:j+3VnnJvnVdLV16fPugN43GvucyfXIDXSg0Z7wSQ0yg= +github.com/klippa-app/go-pdfium v1.17.3/go.mod h1:T7ZFRT9CpW8TKG+P5/4cNa/OvTzSZ+CqzasPz5UeuV4= github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8= github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod 
h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4= +github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI= +github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= +github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= +github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tetratelabs/wazero v1.11.0 h1:+gKemEuKCTevU4d7ZTzlsvgd1uaToIDtlQlmNbwqYhA= +github.com/tetratelabs/wazero v1.11.0/go.mod h1:eV28rsN8Q+xwjogd7f4/Pp4xFxO7uOGbLcD/LzB1wiU= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= +golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= +golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod 
h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= +golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go index 1c50b39..82ff39a 100644 --- a/main.go +++ b/main.go @@ -66,6 +66,7 @@ func main() { var sectionFilter string var expandSection string var searchQuery string + var outputFile string var showRefs bool var jsonMode bool for i := 2; i < len(os.Args); i++ { @@ -85,6 +86,11 @@ func main() { searchQuery = os.Args[i+1] i++ } + case "--output", "-o": + if i+1 < len(os.Args) { + outputFile = os.Args[i+1] + i++ + } case "--refs", "-r": showRefs = true case "--json", "-j": @@ -92,6 +98,49 @@ func main() { } } + // Check if target is a URL + isURL := strings.HasPrefix(target, "http://") || strings.HasPrefix(target, "https://") + if isURL { + doc, err := parser.ParseURL(target) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + // Use the URL's last path segment as filename + parts := strings.Split(strings.TrimRight(target, "/"), "/") + doc.Filename = parts[len(parts)-1] + if doc.Filename == "" { + doc.Filename = target + } + + if outputFile != "" { + yamlContent, err := parser.ExportYAML(doc) + if err != nil { + fmt.Fprintf(os.Stderr, "Error exporting YAML: %v\n", err) + os.Exit(1) + } + if err := os.WriteFile(outputFile, []byte(yamlContent), 0644); err != nil { + fmt.Fprintf(os.Stderr, "Error writing file: %v\n", err) + os.Exit(1) + } + 
fmt.Fprintf(os.Stderr, "Saved to %s\n", outputFile) + } + + if jsonMode { + outputJSON([]*parser.Document{doc}, target) + } else if searchQuery != "" { + render.SearchResults([]*parser.Document{doc}, searchQuery) + } else if expandSection != "" { + render.ExpandSection(doc, expandSection) + } else if sectionFilter != "" { + render.FilteredTree(doc, sectionFilter) + } else if outputFile == "" { + render.Tree(doc) + } + return + } + // Check if target is a directory info, err := os.Stat(target) if err != nil { @@ -154,6 +203,19 @@ func main() { parts := strings.Split(target, "/") doc.Filename = parts[len(parts)-1] + if outputFile != "" { + yamlContent, err := parser.ExportYAML(doc) + if err != nil { + fmt.Fprintf(os.Stderr, "Error exporting YAML: %v\n", err) + os.Exit(1) + } + if err := os.WriteFile(outputFile, []byte(yamlContent), 0644); err != nil { + fmt.Fprintf(os.Stderr, "Error writing file: %v\n", err) + os.Exit(1) + } + fmt.Fprintf(os.Stderr, "Saved to %s\n", outputFile) + } + if jsonMode { absPath, _ := filepath.Abs(target) outputJSON([]*parser.Document{doc}, absPath) @@ -163,7 +225,7 @@ func main() { render.ExpandSection(doc, expandSection) } else if sectionFilter != "" { render.FilteredTree(doc, sectionFilter) - } else { + } else if outputFile == "" { render.Tree(doc) } } @@ -281,7 +343,7 @@ func printUsage() { fmt.Println(`docmap - instant documentation structure for LLMs and humans Usage: - docmap [flags] + docmap [flags] Examples: docmap . # All markdown, PDF, and YAML files @@ -292,12 +354,19 @@ Examples: docmap README.md --section "API" # Filter to section docmap README.md --expand "API" # Show section content docmap . 
--refs # Show cross-references between docs - docmap docs/ --search "auth" # Search across all files + docmap docs/ --search "auth" # Search across all files + +URL Support: + docmap https://example.com/docs # Map a web page + docmap https://example.com/docs --search "auth" # Search sections + docmap https://example.com/docs -o docs.yaml # Save as YAML + docmap docs.yaml --search "auth" # Fast local access Flags: --search Search sections across all files -s, --section Filter to a specific section -e, --expand Show full content of a section + -o, --output Export structure as YAML file -r, --refs Show cross-references between markdown files -j, --json Output JSON format -v, --version Print version @@ -311,5 +380,9 @@ YAML Support: Maps keys to sections with nested children. Sequences use name/id/title fields for titles when available, falling back to key: value or [N]. +URL Support: + Uses headless Chrome to render web pages, then extracts heading structure + from font sizes. Requires Chrome/Chromium installed, or set CHROME_PATH. + More info: https://github.com/JordanCoin/docmap`) } diff --git a/parser/export.go b/parser/export.go new file mode 100644 index 0000000..459f5f7 --- /dev/null +++ b/parser/export.go @@ -0,0 +1,57 @@ +package parser + +import ( + "fmt" + + "gopkg.in/yaml.v3" +) + +// yamlSection is the serializable form of a Section for YAML export. +type yamlSection struct { + Title string `yaml:"title"` + Content string `yaml:"content,omitempty"` + Tokens int `yaml:"tokens"` + Children []yamlSection `yaml:"children,omitempty"` +} + +// yamlDocument is the serializable form of a Document for YAML export. +type yamlDocument struct { + Docmap string `yaml:"docmap"` + Filename string `yaml:"filename,omitempty"` + Tokens int `yaml:"tokens"` + Sections []yamlSection `yaml:"sections"` +} + +// ExportYAML serializes a Document to structured YAML. +// The output can be read back by ParseYAML to reconstruct the document. 
+func ExportYAML(doc *Document) (string, error) { + yd := yamlDocument{ + Docmap: "1.0", + Filename: doc.Filename, + Tokens: doc.TotalTokens, + Sections: convertToYAMLSections(doc.Sections), + } + + data, err := yaml.Marshal(yd) + if err != nil { + return "", fmt.Errorf("failed to marshal YAML: %w", err) + } + + return string(data), nil +} + +func convertToYAMLSections(sections []*Section) []yamlSection { + var result []yamlSection + for _, s := range sections { + ys := yamlSection{ + Title: s.Title, + Tokens: s.Tokens, + Children: convertToYAMLSections(s.Children), + } + if s.Content != "" { + ys.Content = s.Content + } + result = append(result, ys) + } + return result +} diff --git a/parser/url.go b/parser/url.go new file mode 100644 index 0000000..30b862a --- /dev/null +++ b/parser/url.go @@ -0,0 +1,787 @@ +package parser + +import ( + "context" + "fmt" + "math" + "os" + "os/exec" + "path/filepath" + "runtime" + "sort" + "strings" + "time" + + "github.com/klippa-app/go-pdfium/requests" + "github.com/klippa-app/go-pdfium/responses" + "github.com/klippa-app/go-pdfium/webassembly" +) + +// charInfo holds per-character data extracted from PDF +type charInfo struct { + text string + x float64 + y float64 + fontSize float64 + isBold bool + fontName string +} + +// textLine represents a line of text extracted from PDF with font metadata +type textLine struct { + Text string + Y float64 + FontSize float64 + IsBold bool + FontName string +} + +// headingInfo represents a detected heading with its position and level +type headingInfo struct { + LineIdx int + Level int +} + +// ParseURL fetches a URL via headless Chrome, converts to PDF, and extracts sections. 
+func ParseURL(url string) (*Document, error) { + chromePath, err := findChrome() + if err != nil { + return nil, fmt.Errorf("chrome not found: %w\n\nInstall Chrome or set CHROME_PATH environment variable", err) + } + + tmpDir, err := os.MkdirTemp("", "docmap-*") + if err != nil { + return nil, fmt.Errorf("failed to create temp dir: %w", err) + } + defer os.RemoveAll(tmpDir) + + pdfPath := filepath.Join(tmpDir, "page.pdf") + if err := urlToPDF(chromePath, url, pdfPath); err != nil { + return nil, fmt.Errorf("failed to generate PDF: %w", err) + } + + lines, err := extractSpatialText(pdfPath) + if err != nil { + return nil, fmt.Errorf("failed to extract text: %w", err) + } + + if len(lines) == 0 { + return &Document{Sections: []*Section{{ + Level: 1, + Title: "(no extractable text)", + }}}, nil + } + + headings := detectHeadings(lines) + doc := &Document{} + doc.Sections = buildSectionsFromLines(lines, headings) + + for _, s := range doc.GetAllSections() { + doc.TotalTokens += s.Tokens + } + + return doc, nil +} + +// findChrome locates the Chrome/Chromium binary on the system. 
+func findChrome() (string, error) { + if p := os.Getenv("CHROME_PATH"); p != "" { + if _, err := os.Stat(p); err == nil { + return p, nil + } + return "", fmt.Errorf("CHROME_PATH set but not found: %s", p) + } + + switch runtime.GOOS { + case "darwin": + paths := []string{ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Chromium.app/Contents/MacOS/Chromium", + } + for _, p := range paths { + if _, err := os.Stat(p); err == nil { + return p, nil + } + } + case "linux": + names := []string{"google-chrome", "google-chrome-stable", "chromium-browser", "chromium"} + for _, name := range names { + if p, err := exec.LookPath(name); err == nil { + return p, nil + } + } + case "windows": + paths := []string{ + filepath.Join(os.Getenv("PROGRAMFILES"), "Google", "Chrome", "Application", "chrome.exe"), + filepath.Join(os.Getenv("PROGRAMFILES(X86)"), "Google", "Chrome", "Application", "chrome.exe"), + filepath.Join(os.Getenv("LOCALAPPDATA"), "Google", "Chrome", "Application", "chrome.exe"), + } + for _, p := range paths { + if _, err := os.Stat(p); err == nil { + return p, nil + } + } + } + + return "", fmt.Errorf("Chrome/Chromium not found") +} + +// urlToPDF uses headless Chrome to render a URL to PDF. +func urlToPDF(chromePath, url, outPath string) error { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, chromePath, + "--headless", + "--disable-gpu", + "--no-pdf-header-footer", + "--print-to-pdf="+outPath, + url, + ) + cmd.Stderr = nil + cmd.Stdout = nil + + if err := cmd.Run(); err != nil { + if ctx.Err() == context.DeadlineExceeded { + return fmt.Errorf("timed out after 30s rendering %s", url) + } + return fmt.Errorf("chrome failed: %w", err) + } + + if _, err := os.Stat(outPath); err != nil { + return fmt.Errorf("PDF was not generated") + } + + return nil +} + +// extractSpatialText extracts text lines with font metadata from a PDF. 
+// Uses pdftotext (poppler) for clean text when available, with go-pdfium providing +// font size data for heading detection. Falls back to go-pdfium rects if pdftotext +// is not installed. +func extractSpatialText(pdfPath string) ([]textLine, error) { + // Try pdftotext + go-pdfium hybrid first + plainLines, ptErr := extractWithPdftotext(pdfPath) + if ptErr == nil { + // Get heading candidates from go-pdfium rect data + headingCandidates, bodySize, err := extractHeadingCandidates(pdfPath) + if err == nil && len(headingCandidates) > 0 { + return buildLinesWithHeadings(plainLines, headingCandidates, bodySize), nil + } + // If pdfium fails, return pdftotext lines without heading info + var result []textLine + for _, line := range plainLines { + result = append(result, textLine{Text: line, FontSize: 12}) + } + return result, nil + } + + // Fall back to go-pdfium rect-based extraction + return extractWithPdfium(pdfPath) +} + +// headingCandidate is a heading detected from go-pdfium with its normalized text and font size. +type headingCandidate struct { + normalizedText string + fontSize float64 +} + +// extractHeadingCandidates runs go-pdfium rect extraction and heading detection, +// returning normalized heading text + font sizes and the body font size. 
+func extractHeadingCandidates(pdfPath string) ([]headingCandidate, float64, error) { + rectLines, err := extractWithPdfium(pdfPath) + if err != nil { + return nil, 0, err + } + + headings := detectHeadings(rectLines) + if len(headings) == 0 { + return nil, 0, nil + } + + // Determine body size + sizeCount := make(map[float64]int) + for _, l := range rectLines { + rounded := math.Round(l.FontSize*2) / 2 + sizeCount[rounded] += len(l.Text) + } + var bodySize float64 + var maxCount int + for size, count := range sizeCount { + if count > maxCount { + maxCount = count + bodySize = size + } + } + + var candidates []headingCandidate + for _, h := range headings { + norm := normalizeForMatch(rectLines[h.LineIdx].Text) + // Skip very short heading candidates — they cause false matches + if len(norm) < 4 { + continue + } + candidates = append(candidates, headingCandidate{ + normalizedText: norm, + fontSize: rectLines[h.LineIdx].FontSize, + }) + } + + return candidates, bodySize, nil +} + +// normalizeForMatch strips spaces/punctuation and lowercases for fuzzy matching. +func normalizeForMatch(s string) string { + var b strings.Builder + for _, r := range strings.ToLower(s) { + if r >= 'a' && r <= 'z' || r >= '0' && r <= '9' { + b.WriteRune(r) + } + } + return b.String() +} + +// buildLinesWithHeadings assigns heading font sizes to pdftotext lines that match +// heading candidates extracted from go-pdfium. Uses character-overlap matching +// since go-pdfium text may have stutter prefixes (e.g. "VeVersioning" matches "Versioning"). 
+func buildLinesWithHeadings(plainLines []string, candidates []headingCandidate, bodySize float64) []textLine { + var result []textLine + + candidateIdx := 0 + for _, line := range plainLines { + fontSize := bodySize + normalized := normalizeForMatch(line) + + // Check if this line matches the next heading candidate + if candidateIdx < len(candidates) && len(normalized) > 0 { + candidate := candidates[candidateIdx] + if isHeadingMatch(normalized, candidate.normalizedText) { + fontSize = candidate.fontSize + candidateIdx++ + } + } + + result = append(result, textLine{ + Text: line, + FontSize: fontSize, + }) + } + + return result +} + +// isHeadingMatch checks if a clean pdftotext line matches a garbled go-pdfium heading. +// The pdfium text may have stutter prefixes ("veversioning" for "versioning"), so we check +// if the clean text is contained within the garbled text. +func isHeadingMatch(cleanNorm, garbledNorm string) bool { + if len(cleanNorm) < 4 || len(garbledNorm) < 4 { + return false + } + if cleanNorm == garbledNorm { + return true + } + // Clean text should be a substring of garbled (garbled has extra stutter chars) + if strings.Contains(garbledNorm, cleanNorm) { + return true + } + // Or garbled is a substring of clean + if strings.Contains(cleanNorm, garbledNorm) { + return true + } + // Check character overlap ratio — require high overlap + overlap := charOverlap(cleanNorm, garbledNorm) + shorter := len(cleanNorm) + if len(garbledNorm) < shorter { + shorter = len(garbledNorm) + } + return float64(overlap)/float64(shorter) > 0.8 +} + +// charOverlap counts matching characters between two strings using a simple LCS-like approach. +func charOverlap(a, b string) int { + count := 0 + bIdx := 0 + for _, ch := range a { + for bIdx < len(b) { + if rune(b[bIdx]) == ch { + count++ + bIdx++ + break + } + bIdx++ + } + } + return count +} + +// extractWithPdftotext shells out to pdftotext for clean text extraction. 
+func extractWithPdftotext(pdfPath string) ([]string, error) { + pdftotextPath, err := exec.LookPath("pdftotext") + if err != nil { + return nil, fmt.Errorf("pdftotext not found: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, pdftotextPath, pdfPath, "-") + output, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("pdftotext failed: %w", err) + } + + var lines []string + for _, line := range strings.Split(string(output), "\n") { + line = strings.TrimSpace(line) + if line != "" { + lines = append(lines, line) + } + } + return lines, nil +} + + +// extractWithPdfium falls back to go-pdfium rect-based extraction when pdftotext is unavailable. +func extractWithPdfium(pdfPath string) ([]textLine, error) { + pool, err := webassembly.Init(webassembly.Config{ + MinIdle: 1, MaxIdle: 1, MaxTotal: 1, + }) + if err != nil { + return nil, fmt.Errorf("failed to init pdfium: %w", err) + } + defer pool.Close() + + instance, err := pool.GetInstance(30 * time.Second) + if err != nil { + return nil, fmt.Errorf("failed to get pdfium instance: %w", err) + } + + pdfData, err := os.ReadFile(pdfPath) + if err != nil { + return nil, fmt.Errorf("failed to read PDF: %w", err) + } + + doc, err := instance.OpenDocument(&requests.OpenDocument{File: &pdfData}) + if err != nil { + return nil, fmt.Errorf("failed to open PDF: %w", err) + } + defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{Document: doc.Document}) + + pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document}) + if err != nil { + return nil, fmt.Errorf("failed to get page count: %w", err) + } + + var allLines []textLine + for i := 0; i < pageCount.PageCount; i++ { + textResult, err := instance.GetPageTextStructured(&requests.GetPageTextStructured{ + Page: requests.Page{ + ByIndex: &requests.PageByIndex{Document: doc.Document, Index: i}, + }, + Mode: 
requests.GetPageTextStructuredModeRects, + CollectFontInformation: true, + }) + if err != nil { + continue + } + pageLines := groupRectsIntoLines(textResult.Rects) + allLines = append(allLines, pageLines...) + } + return allLines, nil +} + +// rectInfo holds per-rect data extracted from PDF structured text +type rectInfo struct { + text string + x float64 + right float64 + y float64 + fontSize float64 + isBold bool + fontName string +} + +// groupRectsIntoLines groups pre-segmented text rects by Y-coordinate into lines. +func groupRectsIntoLines(rects []*responses.GetPageTextStructuredRect) []textLine { + if len(rects) == 0 { + return nil + } + + var infos []rectInfo + for _, r := range rects { + if strings.TrimSpace(r.Text) == "" { + continue + } + + ri := rectInfo{ + text: r.Text, // keep trailing spaces — they encode word boundaries + x: r.PointPosition.Left, + right: r.PointPosition.Right, + y: r.PointPosition.Top, + } + + if r.FontInformation != nil { + ri.fontSize = r.FontInformation.Size + ri.fontName = r.FontInformation.Name + ri.isBold = r.FontInformation.Flags&(1<<18) != 0 || + strings.Contains(strings.ToLower(r.FontInformation.Name), "bold") + } + + infos = append(infos, ri) + } + + if len(infos) == 0 { + return nil + } + + // Sort by Y (top-to-bottom), then X (left-to-right). + // Use generous Y tolerance since character baselines vary within a line. + sort.Slice(infos, func(i, j int) bool { + avgSize := (infos[i].fontSize + infos[j].fontSize) / 2 + yTol := avgSize * 0.4 + if yTol < 2.0 { + yTol = 2.0 + } + if math.Abs(infos[i].y-infos[j].y) <= yTol { + return infos[i].x < infos[j].x + } + return infos[i].y < infos[j].y + }) + + // Remove overlapping duplicate rects + infos = deduplicateOverlappingRects(infos) + + // Group rects into lines by Y-proximity. + // Use wider tolerance since individual character baselines vary (ascenders/descenders). 
+ var lines []textLine + var currentGroup []rectInfo + var groupMinY, groupMaxY float64 + + for _, ri := range infos { + lineHeight := ri.fontSize * 0.5 + if lineHeight < 3.0 { + lineHeight = 3.0 + } + + if len(currentGroup) > 0 { + // Check if this rect belongs to the current line + withinLine := ri.y >= groupMinY-lineHeight && ri.y <= groupMaxY+lineHeight + if !withinLine { + lines = append(lines, mergeRectsToLine(currentGroup)) + currentGroup = nil + } + } + + currentGroup = append(currentGroup, ri) + if len(currentGroup) == 1 { + groupMinY = ri.y + groupMaxY = ri.y + } else { + if ri.y < groupMinY { + groupMinY = ri.y + } + if ri.y > groupMaxY { + groupMaxY = ri.y + } + } + } + + if len(currentGroup) > 0 { + lines = append(lines, mergeRectsToLine(currentGroup)) + } + + return lines +} + +// deduplicateOverlappingRects removes shorter rects that overlap with longer ones. +// Chrome sometimes renders text twice: a short prefix then the full word at the same position. +func deduplicateOverlappingRects(rects []rectInfo) []rectInfo { + if len(rects) <= 1 { + return rects + } + + var result []rectInfo + + for i := 0; i < len(rects); i++ { + if i+1 < len(rects) { + curr := rects[i] + next := rects[i+1] + // If next rect starts inside current rect and has longer text, skip current + if next.x >= curr.x-1 && next.x < curr.right && + len(strings.TrimSpace(next.text)) > len(strings.TrimSpace(curr.text)) { + continue + } + } + result = append(result, rects[i]) + } + + return result +} + +// mergeRectsToLine merges a group of text rects on the same line into a single textLine. +// Rect text already includes trailing spaces for word boundaries, so we just concatenate. 
+func mergeRectsToLine(rects []rectInfo) textLine { + var text strings.Builder + var totalSize float64 + var boldCount int + var fontName string + + for _, r := range rects { + text.WriteString(r.text) + totalSize += r.fontSize + if r.isBold { + boldCount++ + } + if fontName == "" && r.fontName != "" { + fontName = r.fontName + } + } + + avgSize := totalSize / float64(len(rects)) + + return textLine{ + Text: strings.TrimSpace(text.String()), + Y: rects[0].y, + FontSize: avgSize, + IsBold: boldCount > len(rects)/2, + FontName: fontName, + } +} + +// groupCharsIntoLines groups characters by Y-coordinate into text lines. +func groupCharsIntoLines(chars []*responses.GetPageTextStructuredChar) []textLine { + if len(chars) == 0 { + return nil + } + + var infos []charInfo + for _, c := range chars { + if strings.TrimSpace(c.Text) == "" { + continue + } + + ci := charInfo{ + text: c.Text, + x: c.PointPosition.Left, + y: c.PointPosition.Top, + } + + if c.FontInformation != nil { + ci.fontSize = c.FontInformation.Size + ci.fontName = c.FontInformation.Name + // PDF spec 1.7 Section 5.7.1: bit 19 (0-indexed bit 18) = ForceBold + ci.isBold = c.FontInformation.Flags&(1<<18) != 0 || + strings.Contains(strings.ToLower(c.FontInformation.Name), "bold") + } + + infos = append(infos, ci) + } + + if len(infos) == 0 { + return nil + } + + // Sort by Y (top-to-bottom), then X (left-to-right) + sort.Slice(infos, func(i, j int) bool { + if math.Abs(infos[i].y-infos[j].y) < 1.0 { + return infos[i].x < infos[j].x + } + return infos[i].y < infos[j].y + }) + + // Group into lines by Y-proximity + var lines []textLine + var currentLine []charInfo + currentY := infos[0].y + + for _, ci := range infos { + tolerance := ci.fontSize * 0.3 + if tolerance < 1.0 { + tolerance = 1.0 + } + + if math.Abs(ci.y-currentY) > tolerance && len(currentLine) > 0 { + lines = append(lines, mergeCharsToLine(currentLine)) + currentLine = nil + currentY = ci.y + } + + currentLine = append(currentLine, ci) + if 
len(currentLine) == 1 { + currentY = ci.y + } + } + + if len(currentLine) > 0 { + lines = append(lines, mergeCharsToLine(currentLine)) + } + + return lines +} + +// mergeCharsToLine merges a group of characters into a single textLine. +func mergeCharsToLine(chars []charInfo) textLine { + var text strings.Builder + var totalSize float64 + var boldCount int + var fontName string + + for i, c := range chars { + if i > 0 { + gap := c.x - chars[i-1].x + charWidth := chars[i-1].fontSize * 0.6 + if charWidth < 1 { + charWidth = 5 + } + if gap > charWidth*1.5 { + text.WriteString(" ") + } + } + text.WriteString(c.text) + totalSize += c.fontSize + if c.isBold { + boldCount++ + } + if fontName == "" && c.fontName != "" { + fontName = c.fontName + } + } + + avgSize := totalSize / float64(len(chars)) + + return textLine{ + Text: text.String(), + Y: chars[0].y, + FontSize: avgSize, + IsBold: boldCount > len(chars)/2, + FontName: fontName, + } +} + +// detectHeadings identifies heading lines based on font size distribution. +// The most frequent font size is body text; larger sizes are headings. 
+func detectHeadings(lines []textLine) []headingInfo { + if len(lines) == 0 { + return nil + } + + // Build font size histogram (round to nearest 0.5pt for clustering) + sizeCount := make(map[float64]int) + for _, l := range lines { + rounded := math.Round(l.FontSize*2) / 2 + sizeCount[rounded] += len(l.Text) + } + + // Find body size (most frequent by character count) + var bodySize float64 + var maxCount int + for size, count := range sizeCount { + if count > maxCount { + maxCount = count + bodySize = size + } + } + + // Collect distinct heading sizes (larger than body) + headingSizeSet := make(map[float64]bool) + for size := range sizeCount { + if size > bodySize+0.5 { + headingSizeSet[size] = true + } + } + + if len(headingSizeSet) == 0 { + return nil + } + + // Sort heading sizes descending → largest = level 1 + var headingSizes []float64 + for size := range headingSizeSet { + headingSizes = append(headingSizes, size) + } + sort.Sort(sort.Reverse(sort.Float64Slice(headingSizes))) + + sizeToLevel := make(map[float64]int) + for i, size := range headingSizes { + sizeToLevel[size] = i + 1 + } + + // Map lines to headings + var headings []headingInfo + for i, l := range lines { + rounded := math.Round(l.FontSize*2) / 2 + if level, ok := sizeToLevel[rounded]; ok { + headings = append(headings, headingInfo{ + LineIdx: i, + Level: level, + }) + } + } + + return headings +} + +// buildSectionsFromLines constructs a section tree from text lines and detected headings. 
+func buildSectionsFromLines(lines []textLine, headings []headingInfo) []*Section { + if len(lines) == 0 { + return nil + } + + // Build a set of heading line indices for quick lookup + headingMap := make(map[int]int) // lineIdx → level + for _, h := range headings { + headingMap[h.LineIdx] = h.Level + } + + // If no headings detected, create a single section with all content + if len(headings) == 0 { + var content strings.Builder + for _, l := range lines { + content.WriteString(l.Text) + content.WriteString("\n") + } + text := strings.TrimSpace(content.String()) + return []*Section{{ + Level: 1, + Title: truncateTitle(lines[0].Text), + Content: text, + Tokens: estimateTokens(text), + }} + } + + // Walk lines, creating sections at heading boundaries + var allSections []*Section + var currentSection *Section + var contentBuilder strings.Builder + + for i, l := range lines { + if level, isHeading := headingMap[i]; isHeading { + // Finalize previous section + if currentSection != nil { + currentSection.Content = strings.TrimSpace(contentBuilder.String()) + currentSection.Tokens = estimateTokens(currentSection.Content) + } + + currentSection = &Section{ + Level: level, + Title: strings.TrimSpace(l.Text), + } + allSections = append(allSections, currentSection) + contentBuilder.Reset() + } else if currentSection != nil { + contentBuilder.WriteString(l.Text) + contentBuilder.WriteString("\n") + } + // Lines before the first heading are dropped (usually nav/header chrome) + } + + // Finalize last section + if currentSection != nil { + currentSection.Content = strings.TrimSpace(contentBuilder.String()) + currentSection.Tokens = estimateTokens(currentSection.Content) + } + + // Build tree and calculate cumulative tokens + roots := buildTree(allSections) + return roots +} diff --git a/parser/url_test.go b/parser/url_test.go new file mode 100644 index 0000000..863bb0c --- /dev/null +++ b/parser/url_test.go @@ -0,0 +1,355 @@ +package parser + +import ( + "runtime" + "testing" 
+) + +func TestFindChrome(t *testing.T) { + path, err := findChrome() + if err != nil { + // Chrome not installed is okay for CI — just verify the error is clear + t.Logf("Chrome not found (expected in CI): %v", err) + return + } + if path == "" { + t.Error("findChrome returned empty path with no error") + } + t.Logf("Found Chrome at: %s", path) +} + +func TestFindChromeEnvOverride(t *testing.T) { + t.Setenv("CHROME_PATH", "/nonexistent/chrome") + _, err := findChrome() + if err == nil { + t.Error("expected error for nonexistent CHROME_PATH") + } +} + +func TestFindChromePlatformPaths(t *testing.T) { + // Verify findChrome checks platform-appropriate paths + switch runtime.GOOS { + case "darwin", "linux", "windows": + // Just verify it doesn't panic + findChrome() + default: + t.Logf("Skipping platform test for %s", runtime.GOOS) + } +} + +func TestDetectHeadingsBasic(t *testing.T) { + lines := []textLine{ + {Text: "Main Title", FontSize: 24.0, IsBold: true}, + {Text: "Some body text here.", FontSize: 12.0}, + {Text: "More body text.", FontSize: 12.0}, + {Text: "Subtitle", FontSize: 18.0, IsBold: true}, + {Text: "Body under subtitle.", FontSize: 12.0}, + {Text: "Another body line.", FontSize: 12.0}, + } + + headings := detectHeadings(lines) + if len(headings) != 2 { + t.Fatalf("expected 2 headings, got %d", len(headings)) + } + + // "Main Title" at 24pt should be level 1 + if headings[0].LineIdx != 0 { + t.Errorf("expected first heading at line 0, got %d", headings[0].LineIdx) + } + if headings[0].Level != 1 { + t.Errorf("expected level 1 for Main Title, got %d", headings[0].Level) + } + + // "Subtitle" at 18pt should be level 2 + if headings[1].LineIdx != 3 { + t.Errorf("expected second heading at line 3, got %d", headings[1].LineIdx) + } + if headings[1].Level != 2 { + t.Errorf("expected level 2 for Subtitle, got %d", headings[1].Level) + } +} + +func TestDetectHeadingsThreeLevels(t *testing.T) { + lines := []textLine{ + {Text: "H1", FontSize: 28.0}, + {Text: 
"body", FontSize: 12.0}, + {Text: "body", FontSize: 12.0}, + {Text: "body", FontSize: 12.0}, + {Text: "H2", FontSize: 20.0}, + {Text: "body", FontSize: 12.0}, + {Text: "H3", FontSize: 16.0}, + {Text: "body", FontSize: 12.0}, + } + + headings := detectHeadings(lines) + if len(headings) != 3 { + t.Fatalf("expected 3 headings, got %d", len(headings)) + } + + if headings[0].Level != 1 { + t.Errorf("expected H1 level 1, got %d", headings[0].Level) + } + if headings[1].Level != 2 { + t.Errorf("expected H2 level 2, got %d", headings[1].Level) + } + if headings[2].Level != 3 { + t.Errorf("expected H3 level 3, got %d", headings[2].Level) + } +} + +func TestDetectHeadingsNoHeadings(t *testing.T) { + lines := []textLine{ + {Text: "All same size.", FontSize: 12.0}, + {Text: "Still same size.", FontSize: 12.0}, + {Text: "Yep same size.", FontSize: 12.0}, + } + + headings := detectHeadings(lines) + if len(headings) != 0 { + t.Errorf("expected 0 headings for uniform text, got %d", len(headings)) + } +} + +func TestDetectHeadingsEmpty(t *testing.T) { + headings := detectHeadings(nil) + if headings != nil { + t.Errorf("expected nil for empty input, got %v", headings) + } +} + +func TestDetectHeadingsBodySizeByCharCount(t *testing.T) { + // Body text has more total characters even if heading lines are numerous + lines := []textLine{ + {Text: "Title", FontSize: 24.0}, + {Text: "This is a much longer body text paragraph with many words in it.", FontSize: 12.0}, + {Text: "Another long paragraph of body text that contains a lot of content.", FontSize: 12.0}, + } + + headings := detectHeadings(lines) + if len(headings) != 1 { + t.Fatalf("expected 1 heading, got %d", len(headings)) + } + if headings[0].LineIdx != 0 { + t.Errorf("expected heading at line 0, got %d", headings[0].LineIdx) + } +} + +func TestBuildSectionsFromLinesBasic(t *testing.T) { + lines := []textLine{ + {Text: "Introduction", FontSize: 24.0}, + {Text: "Welcome to our docs.", FontSize: 12.0}, + {Text: "Getting Started", 
FontSize: 24.0}, + {Text: "Install the package.", FontSize: 12.0}, + {Text: "Run the setup command.", FontSize: 12.0}, + } + + headings := []headingInfo{ + {LineIdx: 0, Level: 1}, + {LineIdx: 2, Level: 1}, + } + + sections := buildSectionsFromLines(lines, headings) + if len(sections) != 2 { + t.Fatalf("expected 2 sections, got %d", len(sections)) + } + + if sections[0].Title != "Introduction" { + t.Errorf("expected 'Introduction', got '%s'", sections[0].Title) + } + if sections[0].Content != "Welcome to our docs." { + t.Errorf("expected body content, got '%s'", sections[0].Content) + } + + if sections[1].Title != "Getting Started" { + t.Errorf("expected 'Getting Started', got '%s'", sections[1].Title) + } + if sections[1].Tokens == 0 { + t.Error("expected non-zero tokens for section with content") + } +} + +func TestBuildSectionsFromLinesNested(t *testing.T) { + lines := []textLine{ + {Text: "Chapter 1", FontSize: 24.0}, + {Text: "Chapter intro.", FontSize: 12.0}, + {Text: "Section 1.1", FontSize: 18.0}, + {Text: "Section content.", FontSize: 12.0}, + } + + headings := []headingInfo{ + {LineIdx: 0, Level: 1}, + {LineIdx: 2, Level: 2}, + } + + sections := buildSectionsFromLines(lines, headings) + if len(sections) != 1 { + t.Fatalf("expected 1 root section, got %d", len(sections)) + } + + if sections[0].Title != "Chapter 1" { + t.Errorf("expected 'Chapter 1', got '%s'", sections[0].Title) + } + if len(sections[0].Children) != 1 { + t.Fatalf("expected 1 child section, got %d", len(sections[0].Children)) + } + if sections[0].Children[0].Title != "Section 1.1" { + t.Errorf("expected 'Section 1.1', got '%s'", sections[0].Children[0].Title) + } +} + +func TestBuildSectionsFromLinesNoHeadings(t *testing.T) { + lines := []textLine{ + {Text: "Just some text.", FontSize: 12.0}, + {Text: "More text here.", FontSize: 12.0}, + } + + sections := buildSectionsFromLines(lines, nil) + if len(sections) != 1 { + t.Fatalf("expected 1 fallback section, got %d", len(sections)) + } + if 
sections[0].Title != "Just some text." { + t.Errorf("expected first line as title, got '%s'", sections[0].Title) + } +} + +func TestBuildSectionsFromLinesEmpty(t *testing.T) { + sections := buildSectionsFromLines(nil, nil) + if sections != nil { + t.Errorf("expected nil for empty input, got %v", sections) + } +} + +func TestBuildSectionsFromLinesPreHeadingContent(t *testing.T) { + // Lines before the first heading should be dropped (nav/header noise) + lines := []textLine{ + {Text: "Nav link 1", FontSize: 10.0}, + {Text: "Nav link 2", FontSize: 10.0}, + {Text: "Real Title", FontSize: 24.0}, + {Text: "Real content.", FontSize: 12.0}, + } + + headings := []headingInfo{ + {LineIdx: 2, Level: 1}, + } + + sections := buildSectionsFromLines(lines, headings) + if len(sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(sections)) + } + if sections[0].Title != "Real Title" { + t.Errorf("expected 'Real Title', got '%s'", sections[0].Title) + } +} + +func TestMergeCharsToLine(t *testing.T) { + chars := []charInfo{ + {text: "H", x: 10, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + {text: "e", x: 22, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + {text: "l", x: 34, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + {text: "l", x: 46, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + {text: "o", x: 58, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + } + + line := mergeCharsToLine(chars) + if line.Text != "Hello" { + t.Errorf("expected 'Hello', got '%s'", line.Text) + } + if !line.IsBold { + t.Error("expected bold line") + } + if line.FontSize != 24.0 { + t.Errorf("expected font size 24.0, got %f", line.FontSize) + } + if line.FontName != "Arial-Bold" { + t.Errorf("expected font name 'Arial-Bold', got '%s'", line.FontName) + } +} + +func TestMergeCharsToLineWithSpaces(t *testing.T) { + // Characters with a large gap should produce a space + chars := []charInfo{ + {text: "A", x: 10, fontSize: 12, fontName: "Arial"}, + {text: "B", x: 17, 
fontSize: 12, fontName: "Arial"}, // close (gap 7 < 10.8) + {text: "C", x: 80, fontSize: 12, fontName: "Arial"}, // far away + } + + line := mergeCharsToLine(chars) + if line.Text != "AB C" { + t.Errorf("expected 'AB C', got '%s'", line.Text) + } +} + +func TestExportYAMLRoundTrip(t *testing.T) { + doc := &Document{ + Filename: "test.md", + TotalTokens: 100, + Sections: []*Section{ + { + Level: 1, + Title: "Introduction", + Content: "Welcome to the docs.", + Tokens: 50, + Children: []*Section{ + { + Level: 2, + Title: "Getting Started", + Content: "Install the package.", + Tokens: 25, + }, + }, + }, + { + Level: 1, + Title: "API Reference", + Content: "Endpoint docs.", + Tokens: 25, + }, + }, + } + + yamlContent, err := ExportYAML(doc) + if err != nil { + t.Fatalf("ExportYAML failed: %v", err) + } + + if yamlContent == "" { + t.Fatal("expected non-empty YAML output") + } + + // Parse it back + parsed, err := ParseYAML(yamlContent) + if err != nil { + t.Fatalf("ParseYAML of exported content failed: %v", err) + } + + // Verify structure was preserved + if len(parsed.Sections) == 0 { + t.Fatal("expected sections in round-tripped document") + } + + // Find the "sections" key and verify it has children + var sectionsNode *Section + for _, s := range parsed.Sections { + if s.Title == "sections" { + sectionsNode = s + break + } + } + if sectionsNode == nil { + t.Fatal("expected 'sections' key in parsed YAML") + } + if len(sectionsNode.Children) != 2 { + t.Errorf("expected 2 section children, got %d", len(sectionsNode.Children)) + } +} + +func TestExportYAMLEmpty(t *testing.T) { + doc := &Document{} + yamlContent, err := ExportYAML(doc) + if err != nil { + t.Fatalf("ExportYAML failed on empty doc: %v", err) + } + if yamlContent == "" { + t.Fatal("expected non-empty YAML even for empty doc") + } +} From fc593ffd64109db16b25d554650107621813a37d Mon Sep 17 00:00:00 2001 From: Jordan Coin Jackson Date: Sun, 22 Feb 2026 00:16:49 -0500 Subject: [PATCH 2/2] Add HTML-first 
approach for URL parsing Try parsing semantic HTML headings (h1-h6) before falling back to Chrome/PDF pipeline. Works instantly on SSR doc sites (Mintlify, Docusaurus, etc.) with perfect heading detection. Chrome/PDF remains as fallback for JS-only SPAs. Co-Authored-By: Claude Opus 4.6 --- parser/html.go | 244 ++++++++++++++++++++++++++++++++++++++++++++ parser/html_test.go | 216 +++++++++++++++++++++++++++++++++++++++ parser/url.go | 15 ++- 3 files changed, 474 insertions(+), 1 deletion(-) create mode 100644 parser/html.go create mode 100644 parser/html_test.go diff --git a/parser/html.go b/parser/html.go new file mode 100644 index 0000000..cda60b0 --- /dev/null +++ b/parser/html.go @@ -0,0 +1,244 @@ +package parser + +import ( + "fmt" + "io" + "net/http" + "strings" + "time" + + "golang.org/x/net/html" +) + +// htmlSection is an intermediate representation of a section extracted from HTML. +type htmlSection struct { + level int + title string + content strings.Builder +} + +// parseHTMLFromURL fetches a URL and tries to extract sections from semantic HTML headings. +// Returns nil, nil if the HTML has no usable heading structure (e.g. JS-only SPA). +func parseHTMLFromURL(url string) (*Document, error) { + client := &http.Client{Timeout: 15 * time.Second} + resp, err := client.Get(url) + if err != nil { + return nil, fmt.Errorf("failed to fetch URL: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + return nil, fmt.Errorf("HTTP %d fetching %s", resp.StatusCode, url) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) // 10MB limit + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } + + return parseHTMLContent(string(body)) +} + +// parseHTMLContent extracts document sections from HTML content using semantic heading tags. +// Returns nil, nil if no usable heading structure is found. 
+func parseHTMLContent(htmlContent string) (*Document, error) { + node, err := html.Parse(strings.NewReader(htmlContent)) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + // Find the best content root — prefer
<main>, <article>
, or element with role="main" + contentRoot := findContentRoot(node) + if contentRoot == nil { + contentRoot = findBodyElement(node) + } + if contentRoot == nil { + return nil, nil + } + + // Walk the DOM and extract headings + body text in document order + var sections []htmlSection + var current *htmlSection + + var walk func(*html.Node) + walk = func(n *html.Node) { + // Skip nav, header, footer, sidebar elements + if shouldSkipElement(n) { + return + } + + if n.Type == html.ElementNode { + level := headingLevel(n) + if level > 0 { + title := extractText(n) + title = strings.TrimSpace(title) + if title == "" { + // Skip empty headings + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + return + } + + // Finalize previous section + if current != nil { + sections = append(sections, *current) + } + + current = &htmlSection{level: level, title: title} + return // Don't recurse into heading children (already extracted text) + } + } + + if n.Type == html.TextNode && current != nil { + text := strings.TrimSpace(n.Data) + if text != "" { + current.content.WriteString(text) + current.content.WriteString(" ") + } + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + + walk(contentRoot) + + // Finalize last section + if current != nil { + sections = append(sections, *current) + } + + // No headings found — HTML is probably a JS SPA shell + if len(sections) == 0 { + return nil, nil + } + + // Convert to Section structs + var allSections []*Section + for _, hs := range sections { + content := strings.TrimSpace(hs.content.String()) + s := &Section{ + Level: hs.level, + Title: hs.title, + Content: content, + Tokens: estimateTokens(content), + } + allSections = append(allSections, s) + } + + doc := &Document{ + Sections: buildTree(allSections), + } + for _, s := range allSections { + doc.TotalTokens += s.Tokens + } + + return doc, nil +} + +// headingLevel returns the heading level (1-6) for h1-h6 elements, or 0 for non-headings. 
+func headingLevel(n *html.Node) int { + if n.Type != html.ElementNode { + return 0 + } + switch n.Data { + case "h1": + return 1 + case "h2": + return 2 + case "h3": + return 3 + case "h4": + return 4 + case "h5": + return 5 + case "h6": + return 6 + } + return 0 +} + +// shouldSkipElement returns true for elements that typically contain navigation/chrome, not content. +func shouldSkipElement(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + + // Skip nav, header, footer elements + switch n.Data { + case "nav", "footer", "noscript", "script", "style", "svg", "iframe": + return true + } + + for _, attr := range n.Attr { + val := strings.ToLower(attr.Val) + + // Skip elements with sidebar/nav roles or IDs + if attr.Key == "role" && (val == "navigation" || val == "banner" || val == "contentinfo") { + return true + } + if attr.Key == "id" && (val == "sidebar-title" || strings.Contains(val, "sidebar") || strings.Contains(val, "nav")) { + return true + } + // Skip elements with common nav/sidebar classes + if attr.Key == "class" { + if strings.Contains(val, "sidebar") || strings.Contains(val, "nav-") || + strings.Contains(val, "navigation") || strings.Contains(val, "toc") { + return true + } + } + // Skip hidden elements + if attr.Key == "hidden" || (attr.Key == "aria-hidden" && val == "true") { + return true + } + } + + return false +} + +// findContentRoot looks for a
<main>, <article>
, or element with role="main". +func findContentRoot(n *html.Node) *html.Node { + if n.Type == html.ElementNode { + if n.Data == "main" || n.Data == "article" { + return n + } + for _, attr := range n.Attr { + if attr.Key == "role" && attr.Val == "main" { + return n + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if found := findContentRoot(c); found != nil { + return found + } + } + return nil +} + +// findBodyElement returns the element, or nil. +func findBodyElement(n *html.Node) *html.Node { + if n.Type == html.ElementNode && n.Data == "body" { + return n + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if found := findBodyElement(c); found != nil { + return found + } + } + return nil +} + +// extractText recursively extracts all text content from a node. +func extractText(n *html.Node) string { + if n.Type == html.TextNode { + return n.Data + } + var b strings.Builder + for c := n.FirstChild; c != nil; c = c.NextSibling { + b.WriteString(extractText(c)) + } + return b.String() +} diff --git a/parser/html_test.go b/parser/html_test.go new file mode 100644 index 0000000..2ab95d9 --- /dev/null +++ b/parser/html_test.go @@ -0,0 +1,216 @@ +package parser + +import ( + "testing" +) + +func TestParseHTMLContentBasic(t *testing.T) { + html := ` +
<h1>Getting Started</h1>
+<p>Welcome to the docs.</p>
+<h2>Installation</h2>
+<p>Run npm install.</p>
+<h2>Configuration</h2>
+<p>Edit the config file.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 root section, got %d", len(doc.Sections)) + } + if doc.Sections[0].Title != "Getting Started" { + t.Errorf("expected 'Getting Started', got '%s'", doc.Sections[0].Title) + } + if len(doc.Sections[0].Children) != 2 { + t.Fatalf("expected 2 children, got %d", len(doc.Sections[0].Children)) + } + if doc.Sections[0].Children[0].Title != "Installation" { + t.Errorf("expected 'Installation', got '%s'", doc.Sections[0].Children[0].Title) + } + if doc.Sections[0].Children[1].Title != "Configuration" { + t.Errorf("expected 'Configuration', got '%s'", doc.Sections[0].Children[1].Title) + } +} + +func TestParseHTMLContentSkipsNav(t *testing.T) { + html := ` + +
<nav><h1>Navigation</h1></nav>
+<h1>Main Content</h1>
+<p>Real content here.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 section (nav skipped), got %d", len(doc.Sections)) + } + if doc.Sections[0].Title != "Main Content" { + t.Errorf("expected 'Main Content', got '%s'", doc.Sections[0].Title) + } +} + +func TestParseHTMLContentSkipsSidebar(t *testing.T) { + html := ` + +
<div class="sidebar"><h2>Menu</h2></div>
+<h1>Page Title</h1>
+<p>Body text.</p>
+<footer><h2>Footer Heading</h2></footer>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(doc.Sections)) + } + if doc.Sections[0].Title != "Page Title" { + t.Errorf("expected 'Page Title', got '%s'", doc.Sections[0].Title) + } +} + +func TestParseHTMLContentNoHeadings(t *testing.T) { + html := `
<div><p>Just a paragraph.</p><p>No headings here.</p></div>
` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc != nil { + t.Errorf("expected nil for no headings, got %+v", doc) + } +} + +func TestParseHTMLContentEmpty(t *testing.T) { + doc, err := parseHTMLContent("") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc != nil { + t.Errorf("expected nil for empty HTML, got %+v", doc) + } +} + +func TestParseHTMLContentThreeLevels(t *testing.T) { + html := ` +
<h1>Chapter 1</h1>
+<p>Chapter intro.</p>
+<h2>Section 1.1</h2>
+<p>Section content.</p>
+<h3>Subsection 1.1.1</h3>
+<p>Detail content.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 root, got %d", len(doc.Sections)) + } + root := doc.Sections[0] + if root.Title != "Chapter 1" { + t.Errorf("expected 'Chapter 1', got '%s'", root.Title) + } + if len(root.Children) != 1 { + t.Fatalf("expected 1 child of root, got %d", len(root.Children)) + } + child := root.Children[0] + if child.Title != "Section 1.1" { + t.Errorf("expected 'Section 1.1', got '%s'", child.Title) + } + if len(child.Children) != 1 { + t.Fatalf("expected 1 grandchild, got %d", len(child.Children)) + } + if child.Children[0].Title != "Subsection 1.1.1" { + t.Errorf("expected 'Subsection 1.1.1', got '%s'", child.Children[0].Title) + } +} + +func TestParseHTMLContentExtractsBodyText(t *testing.T) { + html := ` +
<h1>Title</h1>
+<p>First paragraph.</p>
+<p>Second paragraph.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + content := doc.Sections[0].Content + if content == "" { + t.Error("expected non-empty content") + } + if doc.Sections[0].Tokens == 0 { + t.Error("expected non-zero tokens") + } +} + +func TestParseHTMLContentPrefersMainElement(t *testing.T) { + html := ` +
<header><h1>Site Header</h1></header>
+<main>
+<h1>Page Title</h1>
+<p>Content.</p>
+</main>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(doc.Sections)) + } + // Should pick up "Page Title" from
<main>, not "Site Header" from <header>
+ if doc.Sections[0].Title != "Page Title" { + t.Errorf("expected 'Page Title', got '%s'", doc.Sections[0].Title) + } +} + +func TestParseHTMLContentSkipsScriptStyle(t *testing.T) { + html := ` + + +
<h1>Real Title</h1>
+<p>Content here.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(doc.Sections)) + } + if doc.Sections[0].Title != "Real Title" { + t.Errorf("expected 'Real Title', got '%s'", doc.Sections[0].Title) + } +} diff --git a/parser/url.go b/parser/url.go index 30b862a..5465b7b 100644 --- a/parser/url.go +++ b/parser/url.go @@ -42,8 +42,21 @@ type headingInfo struct { Level int } -// ParseURL fetches a URL via headless Chrome, converts to PDF, and extracts sections. +// ParseURL fetches a URL and extracts document sections. +// Tries HTML parsing first (fast, accurate for SSR sites), falls back to Chrome/PDF. func ParseURL(url string) (*Document, error) { + // Try HTML approach first — works for SSR sites (most doc sites) + doc, err := parseHTMLFromURL(url) + if err == nil && doc != nil && len(doc.Sections) > 0 { + return doc, nil + } + + // Fall back to Chrome → PDF → text extraction + return parseURLviaPDF(url) +} + +// parseURLviaPDF uses headless Chrome to render a URL to PDF, then extracts sections. +func parseURLviaPDF(url string) (*Document, error) { chromePath, err := findChrome() if err != nil { return nil, fmt.Errorf("chrome not found: %w\n\nInstall Chrome or set CHROME_PATH environment variable", err)