From 60bb1aaa6580e2d77b6eb7ab66490826fcbcd3fd Mon Sep 17 00:00:00 2001 From: sunshineplan Date: Thu, 4 Dec 2025 09:44:48 +0800 Subject: [PATCH] text: Add RegexpExtractor processor and tests Introduces the RegexpExtractor type to extract the first substring matching a regular expression. Includes implementation, Describe and Once methods, and comprehensive unit tests for various matching scenarios. --- processing/text/processor.go | 18 +++++++++ processing/text/processor_test.go | 62 +++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/processing/text/processor.go b/processing/text/processor.go index 765896a..b28d751 100644 --- a/processing/text/processor.go +++ b/processing/text/processor.go @@ -24,6 +24,7 @@ var ( _ Processor = new(processor) _ Processor = new(multiProcessor) _ Processor = RegexpRemover{} + _ Processor = RegexpExtractor{} _ Processor = Cutter{} _ Processor = Trimmer{} _ Processor = LineToParagraph{} @@ -107,6 +108,23 @@ func (p RegexpRemover) Process(s string) (string, error) { return p.Re.ReplaceAllString(s, ""), nil } +// RegexpExtractor is a Processor that extracts the first (leftmost) substring matching Re. +// If nothing matches, Process returns an empty string. Re must not be nil. +type RegexpExtractor struct { + Re *regexp.Regexp +} + +// Describe returns a label of the form RegexpExtractor("pattern"), with the pattern text quoted by %q. +func (p RegexpExtractor) Describe() string { return fmt.Sprintf("RegexpExtractor(%q)", p.Re.String()) } + +// Once returns true, as extracting a specific part is a transformative operation usually done once. +func (RegexpExtractor) Once() bool { return true } + +// Process returns the leftmost match of Re in s, or an empty string if there is no match; the error is always nil. +func (p RegexpExtractor) Process(s string) (string, error) { + return p.Re.FindString(s), nil +} + // Cutter splits the input by the given separator and keeps only the part before it. 
type Cutter struct { Sep string diff --git a/processing/text/processor_test.go b/processing/text/processor_test.go index 9986da1..b90c2a7 100644 --- a/processing/text/processor_test.go +++ b/processing/text/processor_test.go @@ -1,6 +1,7 @@ package text import ( + "fmt" "regexp" "testing" ) @@ -23,6 +24,67 @@ func TestRegexpRemover(t *testing.T) { } } +func TestRegexpExtractor(t *testing.T) { + tests := []struct { + name string + pattern string + input string + want string + }{ + { + name: "Extract first number", + pattern: `\d+`, + input: "Order ID: 12345, Count: 67", + want: "12345", + }, + { + name: "Extract email", + pattern: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, + input: "Please contact support@example.com for help.", + want: "support@example.com", + }, + { + name: "No match found", + pattern: `\d+`, + input: "No numbers here", + want: "", + }, + { + name: "Empty input", + pattern: `\d+`, + input: "", + want: "", + }, + { + name: "Match start of string", + pattern: `^Hello`, + input: "Hello World", + want: "Hello", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + re := regexp.MustCompile(tt.pattern) + p := RegexpExtractor{Re: re} + expectedDesc := "RegexpExtractor(" + fmt.Sprintf("%q", re.String()) + ")" + if desc := p.Describe(); desc != expectedDesc { + t.Errorf("Describe() = %v, want %v", desc, expectedDesc) + } + if !p.Once() { + t.Error("Once() = false, want true") + } + got, err := p.Process(tt.input) + if err != nil { + t.Errorf("Process() error = %v, wantErr %v", err, nil) + return + } + if got != tt.want { + t.Errorf("Process() = %q, want %q", got, tt.want) + } + }) + } +} + func TestCutter(t *testing.T) { for i, testcase := range []struct { seq string