Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions processing/text/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ var (
_ Processor = new(processor)
_ Processor = new(multiProcessor)
_ Processor = RegexpRemover{}
_ Processor = RegexpExtractor{}
_ Processor = Cutter{}
_ Processor = Trimmer{}
_ Processor = LineToParagraph{}
Expand Down Expand Up @@ -107,6 +108,23 @@ func (p RegexpRemover) Process(s string) (string, error) {
return p.Re.ReplaceAllString(s, ""), nil
}

// RegexpExtractor is a Processor that keeps only the first substring of its
// input that matches Re and discards everything else.
// If no match is found, the result is an empty string.
//
// Re should be set before use. A nil Re does not panic: Process reports it as
// an error and Describe renders it as <nil>.
type RegexpExtractor struct {
	// Re is the regular expression searched for in the input.
	Re *regexp.Regexp
}

// Describe returns a string representation of the RegexpExtractor in the form
// RegexpExtractor("<pattern>"), or RegexpExtractor(<nil>) when Re is unset.
func (p RegexpExtractor) Describe() string {
	if p.Re == nil {
		// Describing a misconfigured extractor must not crash the caller.
		return "RegexpExtractor(<nil>)"
	}
	return fmt.Sprintf("RegexpExtractor(%q)", p.Re.String())
}

// Once returns true, as extracting a specific part is a transformative
// operation usually done once.
func (RegexpExtractor) Once() bool { return true }

// Process returns the leftmost match of Re in s.
//
// The returned string is empty both when nothing matches and when Re matches
// the empty string; the two cases are not distinguished. The error is non-nil
// only when Re is nil.
func (p RegexpExtractor) Process(s string) (string, error) {
	if p.Re == nil {
		return "", fmt.Errorf("regexp extractor: nil regular expression")
	}
	return p.Re.FindString(s), nil
}

// Cutter splits the input by the given separator and keeps only the part before it.
type Cutter struct {
Sep string
Expand Down
62 changes: 62 additions & 0 deletions processing/text/processor_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package text

import (
"fmt"
"regexp"
"testing"
)
Expand All @@ -23,6 +24,67 @@ func TestRegexpRemover(t *testing.T) {
}
}

// TestRegexpExtractor runs a table of pattern/input pairs through
// RegexpExtractor and checks Describe, Once and Process for each entry.
func TestRegexpExtractor(t *testing.T) {
	type extractCase struct {
		name    string
		pattern string
		input   string
		want    string
	}

	cases := []extractCase{
		{name: "Extract first number", pattern: `\d+`, input: "Order ID: 12345, Count: 67", want: "12345"},
		{name: "Extract email", pattern: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, input: "Please contact support@example.com for help.", want: "support@example.com"},
		{name: "No match found", pattern: `\d+`, input: "No numbers here", want: ""},
		{name: "Empty input", pattern: `\d+`, input: "", want: ""},
		{name: "Match start of string", pattern: `^Hello`, input: "Hello World", want: "Hello"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			compiled := regexp.MustCompile(tc.pattern)
			extractor := RegexpExtractor{Re: compiled}

			// The description embeds the quoted pattern source.
			wantDesc := fmt.Sprintf("RegexpExtractor(%q)", compiled.String())
			if gotDesc := extractor.Describe(); gotDesc != wantDesc {
				t.Errorf("Describe() = %v, want %v", gotDesc, wantDesc)
			}

			if once := extractor.Once(); !once {
				t.Error("Once() = false, want true")
			}

			out, err := extractor.Process(tc.input)
			if err != nil {
				t.Errorf("Process() error = %v, wantErr %v", err, nil)
				return
			}
			if out != tc.want {
				t.Errorf("Process() = %q, want %q", out, tc.want)
			}
		})
	}
}

func TestCutter(t *testing.T) {
for i, testcase := range []struct {
seq string
Expand Down
Loading