From 0675b802efd0e885df4ce4de3a62e4699742c359 Mon Sep 17 00:00:00 2001 From: Jordan Coin Jackson Date: Sat, 21 Feb 2026 23:56:17 -0500 Subject: [PATCH 1/2] Add URL support and YAML export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - URL-to-structure pipeline: Chrome headless → PDF → go-pdfium font analysis → section tree - pdftotext integration for clean text extraction with go-pdfium fallback - `-o` flag to export any document as structured YAML - Heading detection via font size histogram (adapts to each document) - Chrome auto-detection on macOS/Linux/Windows with CHROME_PATH override Co-Authored-By: Claude Opus 4.6 --- go.mod | 15 +- go.sum | 49 +++ main.go | 79 ++++- parser/export.go | 57 ++++ parser/url.go | 787 +++++++++++++++++++++++++++++++++++++++++++++ parser/url_test.go | 355 ++++++++++++++++++++ 6 files changed, 1337 insertions(+), 5 deletions(-) create mode 100644 parser/export.go create mode 100644 parser/url.go create mode 100644 parser/url_test.go diff --git a/go.mod b/go.mod index e1e58f4..c0d8300 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,17 @@ module github.com/JordanCoin/docmap go 1.25.5 -require github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 +require ( + github.com/klippa-app/go-pdfium v1.17.3 + github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 + gopkg.in/yaml.v3 v3.0.1 +) -require gopkg.in/yaml.v3 v3.0.1 // indirect +require ( + github.com/google/uuid v1.6.0 // indirect + github.com/jolestar/go-commons-pool/v2 v2.1.2 // indirect + github.com/tetratelabs/wazero v1.11.0 // indirect + golang.org/x/net v0.50.0 // indirect + golang.org/x/sys v0.41.0 // indirect + golang.org/x/text v0.34.0 // indirect +) diff --git a/go.sum b/go.sum index 24ddac1..7201de6 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,54 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod 
h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= +github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/vP9vJGqPwcdqsWjOt+V8J7+bTc= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/jolestar/go-commons-pool/v2 v2.1.2 h1:E+XGo58F23t7HtZiC/W6jzO2Ux2IccSH/yx4nD+J1CM= +github.com/jolestar/go-commons-pool/v2 v2.1.2/go.mod h1:r4NYccrkS5UqP1YQI1COyTZ9UjPJAAGTUxzcsK1kqhY= +github.com/klippa-app/go-pdfium v1.17.3 h1:j+3VnnJvnVdLV16fPugN43GvucyfXIDXSg0Z7wSQ0yg= +github.com/klippa-app/go-pdfium v1.17.3/go.mod h1:T7ZFRT9CpW8TKG+P5/4cNa/OvTzSZ+CqzasPz5UeuV4= github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8= github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod 
h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4= +github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI= +github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= +github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= +github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tetratelabs/wazero v1.11.0 h1:+gKemEuKCTevU4d7ZTzlsvgd1uaToIDtlQlmNbwqYhA= +github.com/tetratelabs/wazero v1.11.0/go.mod h1:eV28rsN8Q+xwjogd7f4/Pp4xFxO7uOGbLcD/LzB1wiU= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= +golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= +golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod 
h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= +golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go index 1c50b39..82ff39a 100644 --- a/main.go +++ b/main.go @@ -66,6 +66,7 @@ func main() { var sectionFilter string var expandSection string var searchQuery string + var outputFile string var showRefs bool var jsonMode bool for i := 2; i < len(os.Args); i++ { @@ -85,6 +86,11 @@ func main() { searchQuery = os.Args[i+1] i++ } + case "--output", "-o": + if i+1 < len(os.Args) { + outputFile = os.Args[i+1] + i++ + } case "--refs", "-r": showRefs = true case "--json", "-j": @@ -92,6 +98,49 @@ func main() { } } + // Check if target is a URL + isURL := strings.HasPrefix(target, "http://") || strings.HasPrefix(target, "https://") + if isURL { + doc, err := parser.ParseURL(target) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + // Use the URL's last path segment as filename + parts := strings.Split(strings.TrimRight(target, "/"), "/") + doc.Filename = parts[len(parts)-1] + if doc.Filename == "" { + doc.Filename = target + } + + if outputFile != "" { + yamlContent, err := parser.ExportYAML(doc) + if err != nil { + fmt.Fprintf(os.Stderr, "Error exporting YAML: %v\n", err) + os.Exit(1) + } + if err := os.WriteFile(outputFile, []byte(yamlContent), 0644); err != nil { + fmt.Fprintf(os.Stderr, "Error writing file: %v\n", err) + os.Exit(1) + } + 
fmt.Fprintf(os.Stderr, "Saved to %s\n", outputFile) + } + + if jsonMode { + outputJSON([]*parser.Document{doc}, target) + } else if searchQuery != "" { + render.SearchResults([]*parser.Document{doc}, searchQuery) + } else if expandSection != "" { + render.ExpandSection(doc, expandSection) + } else if sectionFilter != "" { + render.FilteredTree(doc, sectionFilter) + } else if outputFile == "" { + render.Tree(doc) + } + return + } + // Check if target is a directory info, err := os.Stat(target) if err != nil { @@ -154,6 +203,19 @@ func main() { parts := strings.Split(target, "/") doc.Filename = parts[len(parts)-1] + if outputFile != "" { + yamlContent, err := parser.ExportYAML(doc) + if err != nil { + fmt.Fprintf(os.Stderr, "Error exporting YAML: %v\n", err) + os.Exit(1) + } + if err := os.WriteFile(outputFile, []byte(yamlContent), 0644); err != nil { + fmt.Fprintf(os.Stderr, "Error writing file: %v\n", err) + os.Exit(1) + } + fmt.Fprintf(os.Stderr, "Saved to %s\n", outputFile) + } + if jsonMode { absPath, _ := filepath.Abs(target) outputJSON([]*parser.Document{doc}, absPath) @@ -163,7 +225,7 @@ func main() { render.ExpandSection(doc, expandSection) } else if sectionFilter != "" { render.FilteredTree(doc, sectionFilter) - } else { + } else if outputFile == "" { render.Tree(doc) } } @@ -281,7 +343,7 @@ func printUsage() { fmt.Println(`docmap - instant documentation structure for LLMs and humans Usage: - docmap [flags] + docmap [flags] Examples: docmap . # All markdown, PDF, and YAML files @@ -292,12 +354,19 @@ Examples: docmap README.md --section "API" # Filter to section docmap README.md --expand "API" # Show section content docmap . 
--refs # Show cross-references between docs - docmap docs/ --search "auth" # Search across all files + docmap docs/ --search "auth" # Search across all files + +URL Support: + docmap https://example.com/docs # Map a web page + docmap https://example.com/docs --search "auth" # Search sections + docmap https://example.com/docs -o docs.yaml # Save as YAML + docmap docs.yaml --search "auth" # Fast local access Flags: --search Search sections across all files -s, --section Filter to a specific section -e, --expand Show full content of a section + -o, --output Export structure as YAML file -r, --refs Show cross-references between markdown files -j, --json Output JSON format -v, --version Print version @@ -311,5 +380,9 @@ YAML Support: Maps keys to sections with nested children. Sequences use name/id/title fields for titles when available, falling back to key: value or [N]. +URL Support: + Uses headless Chrome to render web pages, then extracts heading structure + from font sizes. Requires Chrome/Chromium installed, or set CHROME_PATH. + More info: https://github.com/JordanCoin/docmap`) } diff --git a/parser/export.go b/parser/export.go new file mode 100644 index 0000000..459f5f7 --- /dev/null +++ b/parser/export.go @@ -0,0 +1,57 @@ +package parser + +import ( + "fmt" + + "gopkg.in/yaml.v3" +) + +// yamlSection is the serializable form of a Section for YAML export. +type yamlSection struct { + Title string `yaml:"title"` + Content string `yaml:"content,omitempty"` + Tokens int `yaml:"tokens"` + Children []yamlSection `yaml:"children,omitempty"` +} + +// yamlDocument is the serializable form of a Document for YAML export. +type yamlDocument struct { + Docmap string `yaml:"docmap"` + Filename string `yaml:"filename,omitempty"` + Tokens int `yaml:"tokens"` + Sections []yamlSection `yaml:"sections"` +} + +// ExportYAML serializes a Document to structured YAML. +// The output can be read back by ParseYAML to reconstruct the document. 
+func ExportYAML(doc *Document) (string, error) { + yd := yamlDocument{ + Docmap: "1.0", + Filename: doc.Filename, + Tokens: doc.TotalTokens, + Sections: convertToYAMLSections(doc.Sections), + } + + data, err := yaml.Marshal(yd) + if err != nil { + return "", fmt.Errorf("failed to marshal YAML: %w", err) + } + + return string(data), nil +} + +func convertToYAMLSections(sections []*Section) []yamlSection { + var result []yamlSection + for _, s := range sections { + ys := yamlSection{ + Title: s.Title, + Tokens: s.Tokens, + Children: convertToYAMLSections(s.Children), + } + if s.Content != "" { + ys.Content = s.Content + } + result = append(result, ys) + } + return result +} diff --git a/parser/url.go b/parser/url.go new file mode 100644 index 0000000..30b862a --- /dev/null +++ b/parser/url.go @@ -0,0 +1,787 @@ +package parser + +import ( + "context" + "fmt" + "math" + "os" + "os/exec" + "path/filepath" + "runtime" + "sort" + "strings" + "time" + + "github.com/klippa-app/go-pdfium/requests" + "github.com/klippa-app/go-pdfium/responses" + "github.com/klippa-app/go-pdfium/webassembly" +) + +// charInfo holds per-character data extracted from PDF +type charInfo struct { + text string + x float64 + y float64 + fontSize float64 + isBold bool + fontName string +} + +// textLine represents a line of text extracted from PDF with font metadata +type textLine struct { + Text string + Y float64 + FontSize float64 + IsBold bool + FontName string +} + +// headingInfo represents a detected heading with its position and level +type headingInfo struct { + LineIdx int + Level int +} + +// ParseURL fetches a URL via headless Chrome, converts to PDF, and extracts sections. 
+func ParseURL(url string) (*Document, error) { + chromePath, err := findChrome() + if err != nil { + return nil, fmt.Errorf("chrome not found: %w\n\nInstall Chrome or set CHROME_PATH environment variable", err) + } + + tmpDir, err := os.MkdirTemp("", "docmap-*") + if err != nil { + return nil, fmt.Errorf("failed to create temp dir: %w", err) + } + defer os.RemoveAll(tmpDir) + + pdfPath := filepath.Join(tmpDir, "page.pdf") + if err := urlToPDF(chromePath, url, pdfPath); err != nil { + return nil, fmt.Errorf("failed to generate PDF: %w", err) + } + + lines, err := extractSpatialText(pdfPath) + if err != nil { + return nil, fmt.Errorf("failed to extract text: %w", err) + } + + if len(lines) == 0 { + return &Document{Sections: []*Section{{ + Level: 1, + Title: "(no extractable text)", + }}}, nil + } + + headings := detectHeadings(lines) + doc := &Document{} + doc.Sections = buildSectionsFromLines(lines, headings) + + for _, s := range doc.GetAllSections() { + doc.TotalTokens += s.Tokens + } + + return doc, nil +} + +// findChrome locates the Chrome/Chromium binary on the system. 
+func findChrome() (string, error) { + if p := os.Getenv("CHROME_PATH"); p != "" { + if _, err := os.Stat(p); err == nil { + return p, nil + } + return "", fmt.Errorf("CHROME_PATH set but not found: %s", p) + } + + switch runtime.GOOS { + case "darwin": + paths := []string{ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Chromium.app/Contents/MacOS/Chromium", + } + for _, p := range paths { + if _, err := os.Stat(p); err == nil { + return p, nil + } + } + case "linux": + names := []string{"google-chrome", "google-chrome-stable", "chromium-browser", "chromium"} + for _, name := range names { + if p, err := exec.LookPath(name); err == nil { + return p, nil + } + } + case "windows": + paths := []string{ + filepath.Join(os.Getenv("PROGRAMFILES"), "Google", "Chrome", "Application", "chrome.exe"), + filepath.Join(os.Getenv("PROGRAMFILES(X86)"), "Google", "Chrome", "Application", "chrome.exe"), + filepath.Join(os.Getenv("LOCALAPPDATA"), "Google", "Chrome", "Application", "chrome.exe"), + } + for _, p := range paths { + if _, err := os.Stat(p); err == nil { + return p, nil + } + } + } + + return "", fmt.Errorf("Chrome/Chromium not found") +} + +// urlToPDF uses headless Chrome to render a URL to PDF. +func urlToPDF(chromePath, url, outPath string) error { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, chromePath, + "--headless", + "--disable-gpu", + "--no-pdf-header-footer", + "--print-to-pdf="+outPath, + url, + ) + cmd.Stderr = nil + cmd.Stdout = nil + + if err := cmd.Run(); err != nil { + if ctx.Err() == context.DeadlineExceeded { + return fmt.Errorf("timed out after 30s rendering %s", url) + } + return fmt.Errorf("chrome failed: %w", err) + } + + if _, err := os.Stat(outPath); err != nil { + return fmt.Errorf("PDF was not generated") + } + + return nil +} + +// extractSpatialText extracts text lines with font metadata from a PDF. 
+// Uses pdftotext (poppler) for clean text when available, with go-pdfium providing +// font size data for heading detection. Falls back to go-pdfium rects if pdftotext +// is not installed. +func extractSpatialText(pdfPath string) ([]textLine, error) { + // Try pdftotext + go-pdfium hybrid first + plainLines, ptErr := extractWithPdftotext(pdfPath) + if ptErr == nil { + // Get heading candidates from go-pdfium rect data + headingCandidates, bodySize, err := extractHeadingCandidates(pdfPath) + if err == nil && len(headingCandidates) > 0 { + return buildLinesWithHeadings(plainLines, headingCandidates, bodySize), nil + } + // If pdfium fails, return pdftotext lines without heading info + var result []textLine + for _, line := range plainLines { + result = append(result, textLine{Text: line, FontSize: 12}) + } + return result, nil + } + + // Fall back to go-pdfium rect-based extraction + return extractWithPdfium(pdfPath) +} + +// headingCandidate is a heading detected from go-pdfium with its normalized text and font size. +type headingCandidate struct { + normalizedText string + fontSize float64 +} + +// extractHeadingCandidates runs go-pdfium rect extraction and heading detection, +// returning normalized heading text + font sizes and the body font size. 
+func extractHeadingCandidates(pdfPath string) ([]headingCandidate, float64, error) { + rectLines, err := extractWithPdfium(pdfPath) + if err != nil { + return nil, 0, err + } + + headings := detectHeadings(rectLines) + if len(headings) == 0 { + return nil, 0, nil + } + + // Determine body size + sizeCount := make(map[float64]int) + for _, l := range rectLines { + rounded := math.Round(l.FontSize*2) / 2 + sizeCount[rounded] += len(l.Text) + } + var bodySize float64 + var maxCount int + for size, count := range sizeCount { + if count > maxCount { + maxCount = count + bodySize = size + } + } + + var candidates []headingCandidate + for _, h := range headings { + norm := normalizeForMatch(rectLines[h.LineIdx].Text) + // Skip very short heading candidates — they cause false matches + if len(norm) < 4 { + continue + } + candidates = append(candidates, headingCandidate{ + normalizedText: norm, + fontSize: rectLines[h.LineIdx].FontSize, + }) + } + + return candidates, bodySize, nil +} + +// normalizeForMatch strips spaces/punctuation and lowercases for fuzzy matching. +func normalizeForMatch(s string) string { + var b strings.Builder + for _, r := range strings.ToLower(s) { + if r >= 'a' && r <= 'z' || r >= '0' && r <= '9' { + b.WriteRune(r) + } + } + return b.String() +} + +// buildLinesWithHeadings assigns heading font sizes to pdftotext lines that match +// heading candidates extracted from go-pdfium. Uses character-overlap matching +// since go-pdfium text may have stutter prefixes (e.g. "VeVersioning" matches "Versioning"). 
+func buildLinesWithHeadings(plainLines []string, candidates []headingCandidate, bodySize float64) []textLine { + var result []textLine + + candidateIdx := 0 + for _, line := range plainLines { + fontSize := bodySize + normalized := normalizeForMatch(line) + + // Check if this line matches the next heading candidate + if candidateIdx < len(candidates) && len(normalized) > 0 { + candidate := candidates[candidateIdx] + if isHeadingMatch(normalized, candidate.normalizedText) { + fontSize = candidate.fontSize + candidateIdx++ + } + } + + result = append(result, textLine{ + Text: line, + FontSize: fontSize, + }) + } + + return result +} + +// isHeadingMatch checks if a clean pdftotext line matches a garbled go-pdfium heading. +// The pdfium text may have stutter prefixes ("veversioning" for "versioning"), so we check +// if the clean text is contained within the garbled text. +func isHeadingMatch(cleanNorm, garbledNorm string) bool { + if len(cleanNorm) < 4 || len(garbledNorm) < 4 { + return false + } + if cleanNorm == garbledNorm { + return true + } + // Clean text should be a substring of garbled (garbled has extra stutter chars) + if strings.Contains(garbledNorm, cleanNorm) { + return true + } + // Or garbled is a substring of clean + if strings.Contains(cleanNorm, garbledNorm) { + return true + } + // Check character overlap ratio — require high overlap + overlap := charOverlap(cleanNorm, garbledNorm) + shorter := len(cleanNorm) + if len(garbledNorm) < shorter { + shorter = len(garbledNorm) + } + return float64(overlap)/float64(shorter) > 0.8 +} + +// charOverlap counts matching characters between two strings using a simple LCS-like approach. +func charOverlap(a, b string) int { + count := 0 + bIdx := 0 + for _, ch := range a { + for bIdx < len(b) { + if rune(b[bIdx]) == ch { + count++ + bIdx++ + break + } + bIdx++ + } + } + return count +} + +// extractWithPdftotext shells out to pdftotext for clean text extraction. 
+func extractWithPdftotext(pdfPath string) ([]string, error) { + pdftotextPath, err := exec.LookPath("pdftotext") + if err != nil { + return nil, fmt.Errorf("pdftotext not found: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, pdftotextPath, pdfPath, "-") + output, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("pdftotext failed: %w", err) + } + + var lines []string + for _, line := range strings.Split(string(output), "\n") { + line = strings.TrimSpace(line) + if line != "" { + lines = append(lines, line) + } + } + return lines, nil +} + + +// extractWithPdfium falls back to go-pdfium rect-based extraction when pdftotext is unavailable. +func extractWithPdfium(pdfPath string) ([]textLine, error) { + pool, err := webassembly.Init(webassembly.Config{ + MinIdle: 1, MaxIdle: 1, MaxTotal: 1, + }) + if err != nil { + return nil, fmt.Errorf("failed to init pdfium: %w", err) + } + defer pool.Close() + + instance, err := pool.GetInstance(30 * time.Second) + if err != nil { + return nil, fmt.Errorf("failed to get pdfium instance: %w", err) + } + + pdfData, err := os.ReadFile(pdfPath) + if err != nil { + return nil, fmt.Errorf("failed to read PDF: %w", err) + } + + doc, err := instance.OpenDocument(&requests.OpenDocument{File: &pdfData}) + if err != nil { + return nil, fmt.Errorf("failed to open PDF: %w", err) + } + defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{Document: doc.Document}) + + pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document}) + if err != nil { + return nil, fmt.Errorf("failed to get page count: %w", err) + } + + var allLines []textLine + for i := 0; i < pageCount.PageCount; i++ { + textResult, err := instance.GetPageTextStructured(&requests.GetPageTextStructured{ + Page: requests.Page{ + ByIndex: &requests.PageByIndex{Document: doc.Document, Index: i}, + }, + Mode: 
requests.GetPageTextStructuredModeRects, + CollectFontInformation: true, + }) + if err != nil { + continue + } + pageLines := groupRectsIntoLines(textResult.Rects) + allLines = append(allLines, pageLines...) + } + return allLines, nil +} + +// rectInfo holds per-rect data extracted from PDF structured text +type rectInfo struct { + text string + x float64 + right float64 + y float64 + fontSize float64 + isBold bool + fontName string +} + +// groupRectsIntoLines groups pre-segmented text rects by Y-coordinate into lines. +func groupRectsIntoLines(rects []*responses.GetPageTextStructuredRect) []textLine { + if len(rects) == 0 { + return nil + } + + var infos []rectInfo + for _, r := range rects { + if strings.TrimSpace(r.Text) == "" { + continue + } + + ri := rectInfo{ + text: r.Text, // keep trailing spaces — they encode word boundaries + x: r.PointPosition.Left, + right: r.PointPosition.Right, + y: r.PointPosition.Top, + } + + if r.FontInformation != nil { + ri.fontSize = r.FontInformation.Size + ri.fontName = r.FontInformation.Name + ri.isBold = r.FontInformation.Flags&(1<<18) != 0 || + strings.Contains(strings.ToLower(r.FontInformation.Name), "bold") + } + + infos = append(infos, ri) + } + + if len(infos) == 0 { + return nil + } + + // Sort by Y (top-to-bottom), then X (left-to-right). + // Use generous Y tolerance since character baselines vary within a line. + sort.Slice(infos, func(i, j int) bool { + avgSize := (infos[i].fontSize + infos[j].fontSize) / 2 + yTol := avgSize * 0.4 + if yTol < 2.0 { + yTol = 2.0 + } + if math.Abs(infos[i].y-infos[j].y) <= yTol { + return infos[i].x < infos[j].x + } + return infos[i].y < infos[j].y + }) + + // Remove overlapping duplicate rects + infos = deduplicateOverlappingRects(infos) + + // Group rects into lines by Y-proximity. + // Use wider tolerance since individual character baselines vary (ascenders/descenders). 
+ var lines []textLine + var currentGroup []rectInfo + var groupMinY, groupMaxY float64 + + for _, ri := range infos { + lineHeight := ri.fontSize * 0.5 + if lineHeight < 3.0 { + lineHeight = 3.0 + } + + if len(currentGroup) > 0 { + // Check if this rect belongs to the current line + withinLine := ri.y >= groupMinY-lineHeight && ri.y <= groupMaxY+lineHeight + if !withinLine { + lines = append(lines, mergeRectsToLine(currentGroup)) + currentGroup = nil + } + } + + currentGroup = append(currentGroup, ri) + if len(currentGroup) == 1 { + groupMinY = ri.y + groupMaxY = ri.y + } else { + if ri.y < groupMinY { + groupMinY = ri.y + } + if ri.y > groupMaxY { + groupMaxY = ri.y + } + } + } + + if len(currentGroup) > 0 { + lines = append(lines, mergeRectsToLine(currentGroup)) + } + + return lines +} + +// deduplicateOverlappingRects removes shorter rects that overlap with longer ones. +// Chrome sometimes renders text twice: a short prefix then the full word at the same position. +func deduplicateOverlappingRects(rects []rectInfo) []rectInfo { + if len(rects) <= 1 { + return rects + } + + var result []rectInfo + + for i := 0; i < len(rects); i++ { + if i+1 < len(rects) { + curr := rects[i] + next := rects[i+1] + // If next rect starts inside current rect and has longer text, skip current + if next.x >= curr.x-1 && next.x < curr.right && + len(strings.TrimSpace(next.text)) > len(strings.TrimSpace(curr.text)) { + continue + } + } + result = append(result, rects[i]) + } + + return result +} + +// mergeRectsToLine merges a group of text rects on the same line into a single textLine. +// Rect text already includes trailing spaces for word boundaries, so we just concatenate. 
+func mergeRectsToLine(rects []rectInfo) textLine { + var text strings.Builder + var totalSize float64 + var boldCount int + var fontName string + + for _, r := range rects { + text.WriteString(r.text) + totalSize += r.fontSize + if r.isBold { + boldCount++ + } + if fontName == "" && r.fontName != "" { + fontName = r.fontName + } + } + + avgSize := totalSize / float64(len(rects)) + + return textLine{ + Text: strings.TrimSpace(text.String()), + Y: rects[0].y, + FontSize: avgSize, + IsBold: boldCount > len(rects)/2, + FontName: fontName, + } +} + +// groupCharsIntoLines groups characters by Y-coordinate into text lines. +func groupCharsIntoLines(chars []*responses.GetPageTextStructuredChar) []textLine { + if len(chars) == 0 { + return nil + } + + var infos []charInfo + for _, c := range chars { + if strings.TrimSpace(c.Text) == "" { + continue + } + + ci := charInfo{ + text: c.Text, + x: c.PointPosition.Left, + y: c.PointPosition.Top, + } + + if c.FontInformation != nil { + ci.fontSize = c.FontInformation.Size + ci.fontName = c.FontInformation.Name + // PDF spec 1.7 Section 5.7.1: bit 19 (0-indexed bit 18) = ForceBold + ci.isBold = c.FontInformation.Flags&(1<<18) != 0 || + strings.Contains(strings.ToLower(c.FontInformation.Name), "bold") + } + + infos = append(infos, ci) + } + + if len(infos) == 0 { + return nil + } + + // Sort by Y (top-to-bottom), then X (left-to-right) + sort.Slice(infos, func(i, j int) bool { + if math.Abs(infos[i].y-infos[j].y) < 1.0 { + return infos[i].x < infos[j].x + } + return infos[i].y < infos[j].y + }) + + // Group into lines by Y-proximity + var lines []textLine + var currentLine []charInfo + currentY := infos[0].y + + for _, ci := range infos { + tolerance := ci.fontSize * 0.3 + if tolerance < 1.0 { + tolerance = 1.0 + } + + if math.Abs(ci.y-currentY) > tolerance && len(currentLine) > 0 { + lines = append(lines, mergeCharsToLine(currentLine)) + currentLine = nil + currentY = ci.y + } + + currentLine = append(currentLine, ci) + if 
len(currentLine) == 1 { + currentY = ci.y + } + } + + if len(currentLine) > 0 { + lines = append(lines, mergeCharsToLine(currentLine)) + } + + return lines +} + +// mergeCharsToLine merges a group of characters into a single textLine. +func mergeCharsToLine(chars []charInfo) textLine { + var text strings.Builder + var totalSize float64 + var boldCount int + var fontName string + + for i, c := range chars { + if i > 0 { + gap := c.x - chars[i-1].x + charWidth := chars[i-1].fontSize * 0.6 + if charWidth < 1 { + charWidth = 5 + } + if gap > charWidth*1.5 { + text.WriteString(" ") + } + } + text.WriteString(c.text) + totalSize += c.fontSize + if c.isBold { + boldCount++ + } + if fontName == "" && c.fontName != "" { + fontName = c.fontName + } + } + + avgSize := totalSize / float64(len(chars)) + + return textLine{ + Text: text.String(), + Y: chars[0].y, + FontSize: avgSize, + IsBold: boldCount > len(chars)/2, + FontName: fontName, + } +} + +// detectHeadings identifies heading lines based on font size distribution. +// The most frequent font size is body text; larger sizes are headings. 
+func detectHeadings(lines []textLine) []headingInfo { + if len(lines) == 0 { + return nil + } + + // Build font size histogram (round to nearest 0.5pt for clustering) + sizeCount := make(map[float64]int) + for _, l := range lines { + rounded := math.Round(l.FontSize*2) / 2 + sizeCount[rounded] += len(l.Text) + } + + // Find body size (most frequent by character count) + var bodySize float64 + var maxCount int + for size, count := range sizeCount { + if count > maxCount { + maxCount = count + bodySize = size + } + } + + // Collect distinct heading sizes (larger than body) + headingSizeSet := make(map[float64]bool) + for size := range sizeCount { + if size > bodySize+0.5 { + headingSizeSet[size] = true + } + } + + if len(headingSizeSet) == 0 { + return nil + } + + // Sort heading sizes descending → largest = level 1 + var headingSizes []float64 + for size := range headingSizeSet { + headingSizes = append(headingSizes, size) + } + sort.Sort(sort.Reverse(sort.Float64Slice(headingSizes))) + + sizeToLevel := make(map[float64]int) + for i, size := range headingSizes { + sizeToLevel[size] = i + 1 + } + + // Map lines to headings + var headings []headingInfo + for i, l := range lines { + rounded := math.Round(l.FontSize*2) / 2 + if level, ok := sizeToLevel[rounded]; ok { + headings = append(headings, headingInfo{ + LineIdx: i, + Level: level, + }) + } + } + + return headings +} + +// buildSectionsFromLines constructs a section tree from text lines and detected headings. 
+func buildSectionsFromLines(lines []textLine, headings []headingInfo) []*Section { + if len(lines) == 0 { + return nil + } + + // Build a set of heading line indices for quick lookup + headingMap := make(map[int]int) // lineIdx → level + for _, h := range headings { + headingMap[h.LineIdx] = h.Level + } + + // If no headings detected, create a single section with all content + if len(headings) == 0 { + var content strings.Builder + for _, l := range lines { + content.WriteString(l.Text) + content.WriteString("\n") + } + text := strings.TrimSpace(content.String()) + return []*Section{{ + Level: 1, + Title: truncateTitle(lines[0].Text), + Content: text, + Tokens: estimateTokens(text), + }} + } + + // Walk lines, creating sections at heading boundaries + var allSections []*Section + var currentSection *Section + var contentBuilder strings.Builder + + for i, l := range lines { + if level, isHeading := headingMap[i]; isHeading { + // Finalize previous section + if currentSection != nil { + currentSection.Content = strings.TrimSpace(contentBuilder.String()) + currentSection.Tokens = estimateTokens(currentSection.Content) + } + + currentSection = &Section{ + Level: level, + Title: strings.TrimSpace(l.Text), + } + allSections = append(allSections, currentSection) + contentBuilder.Reset() + } else if currentSection != nil { + contentBuilder.WriteString(l.Text) + contentBuilder.WriteString("\n") + } + // Lines before the first heading are dropped (usually nav/header chrome) + } + + // Finalize last section + if currentSection != nil { + currentSection.Content = strings.TrimSpace(contentBuilder.String()) + currentSection.Tokens = estimateTokens(currentSection.Content) + } + + // Build tree and calculate cumulative tokens + roots := buildTree(allSections) + return roots +} diff --git a/parser/url_test.go b/parser/url_test.go new file mode 100644 index 0000000..863bb0c --- /dev/null +++ b/parser/url_test.go @@ -0,0 +1,355 @@ +package parser + +import ( + "runtime" + "testing" 
+) + +func TestFindChrome(t *testing.T) { + path, err := findChrome() + if err != nil { + // Chrome not installed is okay for CI — just verify the error is clear + t.Logf("Chrome not found (expected in CI): %v", err) + return + } + if path == "" { + t.Error("findChrome returned empty path with no error") + } + t.Logf("Found Chrome at: %s", path) +} + +func TestFindChromeEnvOverride(t *testing.T) { + t.Setenv("CHROME_PATH", "/nonexistent/chrome") + _, err := findChrome() + if err == nil { + t.Error("expected error for nonexistent CHROME_PATH") + } +} + +func TestFindChromePlatformPaths(t *testing.T) { + // Verify findChrome checks platform-appropriate paths + switch runtime.GOOS { + case "darwin", "linux", "windows": + // Just verify it doesn't panic + findChrome() + default: + t.Logf("Skipping platform test for %s", runtime.GOOS) + } +} + +func TestDetectHeadingsBasic(t *testing.T) { + lines := []textLine{ + {Text: "Main Title", FontSize: 24.0, IsBold: true}, + {Text: "Some body text here.", FontSize: 12.0}, + {Text: "More body text.", FontSize: 12.0}, + {Text: "Subtitle", FontSize: 18.0, IsBold: true}, + {Text: "Body under subtitle.", FontSize: 12.0}, + {Text: "Another body line.", FontSize: 12.0}, + } + + headings := detectHeadings(lines) + if len(headings) != 2 { + t.Fatalf("expected 2 headings, got %d", len(headings)) + } + + // "Main Title" at 24pt should be level 1 + if headings[0].LineIdx != 0 { + t.Errorf("expected first heading at line 0, got %d", headings[0].LineIdx) + } + if headings[0].Level != 1 { + t.Errorf("expected level 1 for Main Title, got %d", headings[0].Level) + } + + // "Subtitle" at 18pt should be level 2 + if headings[1].LineIdx != 3 { + t.Errorf("expected second heading at line 3, got %d", headings[1].LineIdx) + } + if headings[1].Level != 2 { + t.Errorf("expected level 2 for Subtitle, got %d", headings[1].Level) + } +} + +func TestDetectHeadingsThreeLevels(t *testing.T) { + lines := []textLine{ + {Text: "H1", FontSize: 28.0}, + {Text: 
"body", FontSize: 12.0}, + {Text: "body", FontSize: 12.0}, + {Text: "body", FontSize: 12.0}, + {Text: "H2", FontSize: 20.0}, + {Text: "body", FontSize: 12.0}, + {Text: "H3", FontSize: 16.0}, + {Text: "body", FontSize: 12.0}, + } + + headings := detectHeadings(lines) + if len(headings) != 3 { + t.Fatalf("expected 3 headings, got %d", len(headings)) + } + + if headings[0].Level != 1 { + t.Errorf("expected H1 level 1, got %d", headings[0].Level) + } + if headings[1].Level != 2 { + t.Errorf("expected H2 level 2, got %d", headings[1].Level) + } + if headings[2].Level != 3 { + t.Errorf("expected H3 level 3, got %d", headings[2].Level) + } +} + +func TestDetectHeadingsNoHeadings(t *testing.T) { + lines := []textLine{ + {Text: "All same size.", FontSize: 12.0}, + {Text: "Still same size.", FontSize: 12.0}, + {Text: "Yep same size.", FontSize: 12.0}, + } + + headings := detectHeadings(lines) + if len(headings) != 0 { + t.Errorf("expected 0 headings for uniform text, got %d", len(headings)) + } +} + +func TestDetectHeadingsEmpty(t *testing.T) { + headings := detectHeadings(nil) + if headings != nil { + t.Errorf("expected nil for empty input, got %v", headings) + } +} + +func TestDetectHeadingsBodySizeByCharCount(t *testing.T) { + // Body text has more total characters even if heading lines are numerous + lines := []textLine{ + {Text: "Title", FontSize: 24.0}, + {Text: "This is a much longer body text paragraph with many words in it.", FontSize: 12.0}, + {Text: "Another long paragraph of body text that contains a lot of content.", FontSize: 12.0}, + } + + headings := detectHeadings(lines) + if len(headings) != 1 { + t.Fatalf("expected 1 heading, got %d", len(headings)) + } + if headings[0].LineIdx != 0 { + t.Errorf("expected heading at line 0, got %d", headings[0].LineIdx) + } +} + +func TestBuildSectionsFromLinesBasic(t *testing.T) { + lines := []textLine{ + {Text: "Introduction", FontSize: 24.0}, + {Text: "Welcome to our docs.", FontSize: 12.0}, + {Text: "Getting Started", 
FontSize: 24.0}, + {Text: "Install the package.", FontSize: 12.0}, + {Text: "Run the setup command.", FontSize: 12.0}, + } + + headings := []headingInfo{ + {LineIdx: 0, Level: 1}, + {LineIdx: 2, Level: 1}, + } + + sections := buildSectionsFromLines(lines, headings) + if len(sections) != 2 { + t.Fatalf("expected 2 sections, got %d", len(sections)) + } + + if sections[0].Title != "Introduction" { + t.Errorf("expected 'Introduction', got '%s'", sections[0].Title) + } + if sections[0].Content != "Welcome to our docs." { + t.Errorf("expected body content, got '%s'", sections[0].Content) + } + + if sections[1].Title != "Getting Started" { + t.Errorf("expected 'Getting Started', got '%s'", sections[1].Title) + } + if sections[1].Tokens == 0 { + t.Error("expected non-zero tokens for section with content") + } +} + +func TestBuildSectionsFromLinesNested(t *testing.T) { + lines := []textLine{ + {Text: "Chapter 1", FontSize: 24.0}, + {Text: "Chapter intro.", FontSize: 12.0}, + {Text: "Section 1.1", FontSize: 18.0}, + {Text: "Section content.", FontSize: 12.0}, + } + + headings := []headingInfo{ + {LineIdx: 0, Level: 1}, + {LineIdx: 2, Level: 2}, + } + + sections := buildSectionsFromLines(lines, headings) + if len(sections) != 1 { + t.Fatalf("expected 1 root section, got %d", len(sections)) + } + + if sections[0].Title != "Chapter 1" { + t.Errorf("expected 'Chapter 1', got '%s'", sections[0].Title) + } + if len(sections[0].Children) != 1 { + t.Fatalf("expected 1 child section, got %d", len(sections[0].Children)) + } + if sections[0].Children[0].Title != "Section 1.1" { + t.Errorf("expected 'Section 1.1', got '%s'", sections[0].Children[0].Title) + } +} + +func TestBuildSectionsFromLinesNoHeadings(t *testing.T) { + lines := []textLine{ + {Text: "Just some text.", FontSize: 12.0}, + {Text: "More text here.", FontSize: 12.0}, + } + + sections := buildSectionsFromLines(lines, nil) + if len(sections) != 1 { + t.Fatalf("expected 1 fallback section, got %d", len(sections)) + } + if 
sections[0].Title != "Just some text." { + t.Errorf("expected first line as title, got '%s'", sections[0].Title) + } +} + +func TestBuildSectionsFromLinesEmpty(t *testing.T) { + sections := buildSectionsFromLines(nil, nil) + if sections != nil { + t.Errorf("expected nil for empty input, got %v", sections) + } +} + +func TestBuildSectionsFromLinesPreHeadingContent(t *testing.T) { + // Lines before the first heading should be dropped (nav/header noise) + lines := []textLine{ + {Text: "Nav link 1", FontSize: 10.0}, + {Text: "Nav link 2", FontSize: 10.0}, + {Text: "Real Title", FontSize: 24.0}, + {Text: "Real content.", FontSize: 12.0}, + } + + headings := []headingInfo{ + {LineIdx: 2, Level: 1}, + } + + sections := buildSectionsFromLines(lines, headings) + if len(sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(sections)) + } + if sections[0].Title != "Real Title" { + t.Errorf("expected 'Real Title', got '%s'", sections[0].Title) + } +} + +func TestMergeCharsToLine(t *testing.T) { + chars := []charInfo{ + {text: "H", x: 10, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + {text: "e", x: 22, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + {text: "l", x: 34, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + {text: "l", x: 46, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + {text: "o", x: 58, fontSize: 24, isBold: true, fontName: "Arial-Bold"}, + } + + line := mergeCharsToLine(chars) + if line.Text != "Hello" { + t.Errorf("expected 'Hello', got '%s'", line.Text) + } + if !line.IsBold { + t.Error("expected bold line") + } + if line.FontSize != 24.0 { + t.Errorf("expected font size 24.0, got %f", line.FontSize) + } + if line.FontName != "Arial-Bold" { + t.Errorf("expected font name 'Arial-Bold', got '%s'", line.FontName) + } +} + +func TestMergeCharsToLineWithSpaces(t *testing.T) { + // Characters with a large gap should produce a space + chars := []charInfo{ + {text: "A", x: 10, fontSize: 12, fontName: "Arial"}, + {text: "B", x: 17, 
fontSize: 12, fontName: "Arial"}, // close (gap 7 < 10.8) + {text: "C", x: 80, fontSize: 12, fontName: "Arial"}, // far away + } + + line := mergeCharsToLine(chars) + if line.Text != "AB C" { + t.Errorf("expected 'AB C', got '%s'", line.Text) + } +} + +func TestExportYAMLRoundTrip(t *testing.T) { + doc := &Document{ + Filename: "test.md", + TotalTokens: 100, + Sections: []*Section{ + { + Level: 1, + Title: "Introduction", + Content: "Welcome to the docs.", + Tokens: 50, + Children: []*Section{ + { + Level: 2, + Title: "Getting Started", + Content: "Install the package.", + Tokens: 25, + }, + }, + }, + { + Level: 1, + Title: "API Reference", + Content: "Endpoint docs.", + Tokens: 25, + }, + }, + } + + yamlContent, err := ExportYAML(doc) + if err != nil { + t.Fatalf("ExportYAML failed: %v", err) + } + + if yamlContent == "" { + t.Fatal("expected non-empty YAML output") + } + + // Parse it back + parsed, err := ParseYAML(yamlContent) + if err != nil { + t.Fatalf("ParseYAML of exported content failed: %v", err) + } + + // Verify structure was preserved + if len(parsed.Sections) == 0 { + t.Fatal("expected sections in round-tripped document") + } + + // Find the "sections" key and verify it has children + var sectionsNode *Section + for _, s := range parsed.Sections { + if s.Title == "sections" { + sectionsNode = s + break + } + } + if sectionsNode == nil { + t.Fatal("expected 'sections' key in parsed YAML") + } + if len(sectionsNode.Children) != 2 { + t.Errorf("expected 2 section children, got %d", len(sectionsNode.Children)) + } +} + +func TestExportYAMLEmpty(t *testing.T) { + doc := &Document{} + yamlContent, err := ExportYAML(doc) + if err != nil { + t.Fatalf("ExportYAML failed on empty doc: %v", err) + } + if yamlContent == "" { + t.Fatal("expected non-empty YAML even for empty doc") + } +} From fc593ffd64109db16b25d554650107621813a37d Mon Sep 17 00:00:00 2001 From: Jordan Coin Jackson Date: Sun, 22 Feb 2026 00:16:49 -0500 Subject: [PATCH 2/2] Add HTML-first 
approach for URL parsing Try parsing semantic HTML headings (h1-h6) before falling back to Chrome/PDF pipeline. Works instantly on SSR doc sites (Mintlify, Docusaurus, etc.) with perfect heading detection. Chrome/PDF remains as fallback for JS-only SPAs. Co-Authored-By: Claude Opus 4.6 --- parser/html.go | 244 ++++++++++++++++++++++++++++++++++++++++++++ parser/html_test.go | 216 +++++++++++++++++++++++++++++++++++++++ parser/url.go | 15 ++- 3 files changed, 474 insertions(+), 1 deletion(-) create mode 100644 parser/html.go create mode 100644 parser/html_test.go diff --git a/parser/html.go b/parser/html.go new file mode 100644 index 0000000..cda60b0 --- /dev/null +++ b/parser/html.go @@ -0,0 +1,244 @@ +package parser + +import ( + "fmt" + "io" + "net/http" + "strings" + "time" + + "golang.org/x/net/html" +) + +// htmlSection is an intermediate representation of a section extracted from HTML. +type htmlSection struct { + level int + title string + content strings.Builder +} + +// parseHTMLFromURL fetches a URL and tries to extract sections from semantic HTML headings. +// Returns nil, nil if the HTML has no usable heading structure (e.g. JS-only SPA). +func parseHTMLFromURL(url string) (*Document, error) { + client := &http.Client{Timeout: 15 * time.Second} + resp, err := client.Get(url) + if err != nil { + return nil, fmt.Errorf("failed to fetch URL: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + return nil, fmt.Errorf("HTTP %d fetching %s", resp.StatusCode, url) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) // 10MB limit + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } + + return parseHTMLContent(string(body)) +} + +// parseHTMLContent extracts document sections from HTML content using semantic heading tags. +// Returns nil, nil if no usable heading structure is found. 
+func parseHTMLContent(htmlContent string) (*Document, error) { + node, err := html.Parse(strings.NewReader(htmlContent)) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + // Find the best content root — prefer
<main>, <article>
, or element with role="main" + contentRoot := findContentRoot(node) + if contentRoot == nil { + contentRoot = findBodyElement(node) + } + if contentRoot == nil { + return nil, nil + } + + // Walk the DOM and extract headings + body text in document order + var sections []htmlSection + var current *htmlSection + + var walk func(*html.Node) + walk = func(n *html.Node) { + // Skip nav, header, footer, sidebar elements + if shouldSkipElement(n) { + return + } + + if n.Type == html.ElementNode { + level := headingLevel(n) + if level > 0 { + title := extractText(n) + title = strings.TrimSpace(title) + if title == "" { + // Skip empty headings + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + return + } + + // Finalize previous section + if current != nil { + sections = append(sections, *current) + } + + current = &htmlSection{level: level, title: title} + return // Don't recurse into heading children (already extracted text) + } + } + + if n.Type == html.TextNode && current != nil { + text := strings.TrimSpace(n.Data) + if text != "" { + current.content.WriteString(text) + current.content.WriteString(" ") + } + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + + walk(contentRoot) + + // Finalize last section + if current != nil { + sections = append(sections, *current) + } + + // No headings found — HTML is probably a JS SPA shell + if len(sections) == 0 { + return nil, nil + } + + // Convert to Section structs + var allSections []*Section + for _, hs := range sections { + content := strings.TrimSpace(hs.content.String()) + s := &Section{ + Level: hs.level, + Title: hs.title, + Content: content, + Tokens: estimateTokens(content), + } + allSections = append(allSections, s) + } + + doc := &Document{ + Sections: buildTree(allSections), + } + for _, s := range allSections { + doc.TotalTokens += s.Tokens + } + + return doc, nil +} + +// headingLevel returns the heading level (1-6) for h1-h6 elements, or 0 for non-headings. 
+func headingLevel(n *html.Node) int { + if n.Type != html.ElementNode { + return 0 + } + switch n.Data { + case "h1": + return 1 + case "h2": + return 2 + case "h3": + return 3 + case "h4": + return 4 + case "h5": + return 5 + case "h6": + return 6 + } + return 0 +} + +// shouldSkipElement returns true for elements that typically contain navigation/chrome, not content. +func shouldSkipElement(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + + // Skip nav, header, footer elements + switch n.Data { + case "nav", "footer", "noscript", "script", "style", "svg", "iframe": + return true + } + + for _, attr := range n.Attr { + val := strings.ToLower(attr.Val) + + // Skip elements with sidebar/nav roles or IDs + if attr.Key == "role" && (val == "navigation" || val == "banner" || val == "contentinfo") { + return true + } + if attr.Key == "id" && (val == "sidebar-title" || strings.Contains(val, "sidebar") || strings.Contains(val, "nav")) { + return true + } + // Skip elements with common nav/sidebar classes + if attr.Key == "class" { + if strings.Contains(val, "sidebar") || strings.Contains(val, "nav-") || + strings.Contains(val, "navigation") || strings.Contains(val, "toc") { + return true + } + } + // Skip hidden elements + if attr.Key == "hidden" || (attr.Key == "aria-hidden" && val == "true") { + return true + } + } + + return false +} + +// findContentRoot looks for a
<main>, <article>
, or element with role="main". +func findContentRoot(n *html.Node) *html.Node { + if n.Type == html.ElementNode { + if n.Data == "main" || n.Data == "article" { + return n + } + for _, attr := range n.Attr { + if attr.Key == "role" && attr.Val == "main" { + return n + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if found := findContentRoot(c); found != nil { + return found + } + } + return nil +} + +// findBodyElement returns the element, or nil. +func findBodyElement(n *html.Node) *html.Node { + if n.Type == html.ElementNode && n.Data == "body" { + return n + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if found := findBodyElement(c); found != nil { + return found + } + } + return nil +} + +// extractText recursively extracts all text content from a node. +func extractText(n *html.Node) string { + if n.Type == html.TextNode { + return n.Data + } + var b strings.Builder + for c := n.FirstChild; c != nil; c = c.NextSibling { + b.WriteString(extractText(c)) + } + return b.String() +} diff --git a/parser/html_test.go b/parser/html_test.go new file mode 100644 index 0000000..2ab95d9 --- /dev/null +++ b/parser/html_test.go @@ -0,0 +1,216 @@ +package parser + +import ( + "testing" +) + +func TestParseHTMLContentBasic(t *testing.T) { + html := ` +
<h1>Getting Started</h1>
+<p>Welcome to the docs.</p>
+<h2>Installation</h2>
+<p>Run npm install.</p>
+<h2>Configuration</h2>
+<p>Edit the config file.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 root section, got %d", len(doc.Sections)) + } + if doc.Sections[0].Title != "Getting Started" { + t.Errorf("expected 'Getting Started', got '%s'", doc.Sections[0].Title) + } + if len(doc.Sections[0].Children) != 2 { + t.Fatalf("expected 2 children, got %d", len(doc.Sections[0].Children)) + } + if doc.Sections[0].Children[0].Title != "Installation" { + t.Errorf("expected 'Installation', got '%s'", doc.Sections[0].Children[0].Title) + } + if doc.Sections[0].Children[1].Title != "Configuration" { + t.Errorf("expected 'Configuration', got '%s'", doc.Sections[0].Children[1].Title) + } +} + +func TestParseHTMLContentSkipsNav(t *testing.T) { + html := ` + +
<nav><h1>Navigation</h1></nav>
+<h1>Main Content</h1>
+<p>Real content here.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 section (nav skipped), got %d", len(doc.Sections)) + } + if doc.Sections[0].Title != "Main Content" { + t.Errorf("expected 'Main Content', got '%s'", doc.Sections[0].Title) + } +} + +func TestParseHTMLContentSkipsSidebar(t *testing.T) { + html := ` + +
<div class="sidebar"><h2>Menu</h2></div>
+<h1>Page Title</h1>
+<p>Body text.</p>
+<footer><h2>Footer Heading</h2></footer>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(doc.Sections)) + } + if doc.Sections[0].Title != "Page Title" { + t.Errorf("expected 'Page Title', got '%s'", doc.Sections[0].Title) + } +} + +func TestParseHTMLContentNoHeadings(t *testing.T) { + html := `
<div><p>Just a paragraph.</p><p>No headings here.</p></div>
` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc != nil { + t.Errorf("expected nil for no headings, got %+v", doc) + } +} + +func TestParseHTMLContentEmpty(t *testing.T) { + doc, err := parseHTMLContent("") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc != nil { + t.Errorf("expected nil for empty HTML, got %+v", doc) + } +} + +func TestParseHTMLContentThreeLevels(t *testing.T) { + html := ` +
<h1>Chapter 1</h1>
+<p>Chapter intro.</p>
+<h2>Section 1.1</h2>
+<p>Section content.</p>
+<h3>Subsection 1.1.1</h3>
+<p>Detail content.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 root, got %d", len(doc.Sections)) + } + root := doc.Sections[0] + if root.Title != "Chapter 1" { + t.Errorf("expected 'Chapter 1', got '%s'", root.Title) + } + if len(root.Children) != 1 { + t.Fatalf("expected 1 child of root, got %d", len(root.Children)) + } + child := root.Children[0] + if child.Title != "Section 1.1" { + t.Errorf("expected 'Section 1.1', got '%s'", child.Title) + } + if len(child.Children) != 1 { + t.Fatalf("expected 1 grandchild, got %d", len(child.Children)) + } + if child.Children[0].Title != "Subsection 1.1.1" { + t.Errorf("expected 'Subsection 1.1.1', got '%s'", child.Children[0].Title) + } +} + +func TestParseHTMLContentExtractsBodyText(t *testing.T) { + html := ` +
<h1>Title</h1>
+<p>First paragraph.</p>
+<p>Second paragraph.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + content := doc.Sections[0].Content + if content == "" { + t.Error("expected non-empty content") + } + if doc.Sections[0].Tokens == 0 { + t.Error("expected non-zero tokens") + } +} + +func TestParseHTMLContentPrefersMainElement(t *testing.T) { + html := ` +
<header><h1>Site Header</h1></header>
+<main>
+<h1>Page Title</h1>
+<p>Content.</p>
+</main>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(doc.Sections)) + } + // Should pick up "Page Title" from
<main>, not "Site Header" from <header>
+ if doc.Sections[0].Title != "Page Title" { + t.Errorf("expected 'Page Title', got '%s'", doc.Sections[0].Title) + } +} + +func TestParseHTMLContentSkipsScriptStyle(t *testing.T) { + html := ` + + +
<h1>Real Title</h1>
+<p>Content here.</p>
+ ` + + doc, err := parseHTMLContent(html) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc == nil { + t.Fatal("expected document, got nil") + } + if len(doc.Sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(doc.Sections)) + } + if doc.Sections[0].Title != "Real Title" { + t.Errorf("expected 'Real Title', got '%s'", doc.Sections[0].Title) + } +} diff --git a/parser/url.go b/parser/url.go index 30b862a..5465b7b 100644 --- a/parser/url.go +++ b/parser/url.go @@ -42,8 +42,21 @@ type headingInfo struct { Level int } -// ParseURL fetches a URL via headless Chrome, converts to PDF, and extracts sections. +// ParseURL fetches a URL and extracts document sections. +// Tries HTML parsing first (fast, accurate for SSR sites), falls back to Chrome/PDF. func ParseURL(url string) (*Document, error) { + // Try HTML approach first — works for SSR sites (most doc sites) + doc, err := parseHTMLFromURL(url) + if err == nil && doc != nil && len(doc.Sections) > 0 { + return doc, nil + } + + // Fall back to Chrome → PDF → text extraction + return parseURLviaPDF(url) +} + +// parseURLviaPDF uses headless Chrome to render a URL to PDF, then extracts sections. +func parseURLviaPDF(url string) (*Document, error) { chromePath, err := findChrome() if err != nil { return nil, fmt.Errorf("chrome not found: %w\n\nInstall Chrome or set CHROME_PATH environment variable", err)