Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions processing/text/processor.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package text

import (
"bufio"
"fmt"
"html"
"regexp"
"strings"
)
Expand All @@ -24,6 +26,7 @@ var (
_ Processor = RegexpRemover{}
_ Processor = Cutter{}
_ Processor = Trimmer{}
_ Processor = LineToParagraph{}
)

// processor is a generic implementation of Processor,
Expand Down Expand Up @@ -137,6 +140,51 @@ func (p Trimmer) Process(s string) (string, error) {
return strings.Trim(s, p.Cutset), nil
}

// LineToParagraph converts each line of text into a separate HTML <p>...</p> paragraph.
//
// Input is split on "\n"; a single trailing "\r" is dropped from each line, so
// both LF and CRLF line endings are accepted. The content of every line is
// HTML-escaped before it is wrapped, and each paragraph is followed by "\n".
//
// The zero value preserves surrounding whitespace and renders empty lines
// as <p></p>.
type LineToParagraph struct {
	// TrimSpace controls whether leading and trailing whitespace is removed
	// from each line before it is wrapped in <p> tags.
	//   true  → trim whitespace
	//   false → preserve whitespace (zero value)
	TrimSpace bool
	// SkipEmpty controls whether empty lines produce <p></p> or are ignored.
	// Emptiness is checked after trimming, so with TrimSpace set a
	// whitespace-only line counts as empty.
	//   true  → skip empty lines
	//   false → emit <p></p> for empty lines (zero value)
	SkipEmpty bool
}

// Describe returns a human-readable description of the processor,
// including the current value of both flags.
func (p LineToParagraph) Describe() string {
	return fmt.Sprintf("LineToParagraph(TrimSpace=%t, SkipEmpty=%t)", p.TrimSpace, p.SkipEmpty)
}

// Once returns true: the processor is meant to be applied a single time.
// The transformation is not idempotent; a second pass would HTML-escape and
// re-wrap the <p> tags produced by the first one.
func (LineToParagraph) Once() bool { return true }

// Process transforms the input text line-by-line into HTML paragraphs.
//
// Each line is optionally trimmed (TrimSpace), optionally dropped when empty
// (SkipEmpty), HTML-escaped and emitted as "<p>line</p>\n". An empty input
// yields an empty output. Lines of any length are supported. The returned
// error is whatever the underlying scanner reports.
func (p LineToParagraph) Process(s string) (string, error) {
	scanner := bufio.NewScanner(strings.NewReader(s))
	// The default scanner limit (bufio.MaxScanTokenSize, 64 KiB) would reject
	// longer lines with bufio.ErrTooLong. No line can be longer than the whole
	// input, and the limit must be strictly greater than the longest token,
	// hence len(s)+1.
	scanner.Buffer(nil, len(s)+1)

	var b strings.Builder
	b.Grow(len(s))
	for scanner.Scan() {
		line := scanner.Text()
		if p.TrimSpace {
			line = strings.TrimSpace(line)
		}
		if p.SkipEmpty && line == "" {
			continue
		}
		b.WriteString("<p>")
		b.WriteString(html.EscapeString(line))
		b.WriteString("</p>\n")
	}
	if err := scanner.Err(); err != nil {
		return "", err
	}
	return b.String(), nil
}

// TrimSpace returns a processor that removes leading and trailing spaces.
func TrimSpace() Processor {
return NewProcessor("TrimSpace", false, WrapFunc(strings.TrimSpace))
Expand All @@ -162,6 +210,12 @@ func RemoveParentheses() Processor {
)
}

// ToParagraphs builds a LineToParagraph processor that wraps every line in
// <p>...</p>. When skipEmpty is true empty lines are dropped; when it is false
// they are rendered as <p></p>. TrimSpace is left at its zero value (false).
func ToParagraphs(skipEmpty bool) Processor {
	var p LineToParagraph
	p.SkipEmpty = skipEmpty
	return p
}

// WrapFunc wraps a simple string -> string function
// into a function matching the Processor signature.
func WrapFunc(fn func(string) string) func(string) (string, error) {
Expand Down
71 changes: 71 additions & 0 deletions processing/text/processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,74 @@ func TestTrimmer(t *testing.T) {
}
}
}

// TestLineToParagraph runs LineToParagraph through a single-processor task
// pipeline and compares the result with the expected HTML for every
// combination of the TrimSpace and SkipEmpty flags.
func TestLineToParagraph(t *testing.T) {
	for i, tc := range []struct {
		proc     Processor
		input    string
		expected string
	}{
		// Zero value: TrimSpace=false, SkipEmpty=false — whitespace is kept
		// and empty lines are rendered as <p></p>.
		{
			LineToParagraph{},
			" First line \n\n Second line\t \n\n",
			"<p> First line </p>\n<p></p>\n<p> Second line\t </p>\n<p></p>\n",
		},
		// Explicitly enable trimming of leading/trailing whitespace
		{
			LineToParagraph{TrimSpace: true},
			" Hello \n World \n",
			"<p>Hello</p>\n<p>World</p>\n",
		},
		// Preserve empty lines (SkipEmpty = false)
		{
			LineToParagraph{SkipEmpty: false},
			"Line 1\n\n\nLine 2\n",
			"<p>Line 1</p>\n<p></p>\n<p></p>\n<p>Line 2</p>\n",
		},
		// Trim + preserve empty lines
		{
			LineToParagraph{TrimSpace: true, SkipEmpty: false},
			" \n A \n \nB \n",
			"<p></p>\n<p>A</p>\n<p></p>\n<p>B</p>\n",
		},
		// Fully literal mode: keep all original whitespace and emit every line
		{
			LineToParagraph{TrimSpace: false, SkipEmpty: false},
			"\tIndented\n \n Spaces only \n\nTrailing \n",
			"<p>\tIndented</p>\n<p> </p>\n<p> Spaces only </p>\n<p></p>\n<p>Trailing </p>\n",
		},
		// Empty input
		{
			LineToParagraph{},
			"",
			"",
		},
		// Input containing only empty lines and whitespace
		{
			LineToParagraph{},
			"\n \n\t\n \n",
			"<p></p>\n<p> </p>\n<p>\t</p>\n<p> </p>\n",
		},
		// HTML escaping works regardless of configuration
		{
			LineToParagraph{TrimSpace: true},
			" <script>alert(1)</script> \n &copy; 2025 \n",
			"<p>&lt;script&gt;alert(1)&lt;/script&gt;</p>\n<p>&amp;copy; 2025</p>\n",
		},
		{
			LineToParagraph{TrimSpace: false},
			" <b>bold</b> \n",
			"<p> &lt;b&gt;bold&lt;/b&gt; </p>\n",
		},
		// Skip empty lines (SkipEmpty = true)
		{
			LineToParagraph{SkipEmpty: true},
			"Line 1\n\n\nLine 2\n",
			"<p>Line 1</p>\n<p>Line 2</p>\n",
		},
		// Without trimming, a whitespace-only line is not empty and is kept
		{
			LineToParagraph{SkipEmpty: true},
			"A\n \n\nB\n",
			"<p>A</p>\n<p> </p>\n<p>B</p>\n",
		},
		// Trim + skip: whitespace-only lines become empty and are dropped
		{
			LineToParagraph{TrimSpace: true, SkipEmpty: true},
			" \n A \n \nB \n",
			"<p>A</p>\n<p>B</p>\n",
		},
		// Constructor helper with skipping enabled
		{
			ToParagraphs(true),
			"x\n\ny\n",
			"<p>x</p>\n<p>y</p>\n",
		},
	} {
		res, err := NewTasks().Append(tc.proc).Process(tc.input)
		if err != nil {
			t.Errorf("case %d: unexpected error: %v", i, err)
			continue
		}
		if res != tc.expected {
			t.Errorf("case %d:\ngot:\n%q\nwant:\n%q", i, res, tc.expected)
		}
	}
}
4 changes: 1 addition & 3 deletions processing/text/task.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package text

import (
"fmt"
)
import "fmt"

// MaxIter is an exported package-level setting initialised to 100.
// NOTE(review): presumably an upper bound on how many times processors are
// re-applied to a text; its usage is not visible here — confirm.
var MaxIter = 100

Expand Down
Loading