From 60bb1aaa6580e2d77b6eb7ab66490826fcbcd3fd Mon Sep 17 00:00:00 2001
From: sunshineplan <sunshineplan@users.noreply.github.com>
Date: Thu, 4 Dec 2025 09:44:48 +0800
Subject: [PATCH] text: Add RegexpExtractor processor and tests

Introduces the RegexpExtractor type to extract the first substring matching a regular expression. Includes implementation, Describe and Once methods, and comprehensive unit tests for various matching scenarios.
---
 processing/text/processor.go      | 18 +++++++++
 processing/text/processor_test.go | 62 +++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/processing/text/processor.go b/processing/text/processor.go
index 765896a..b28d751 100644
--- a/processing/text/processor.go
+++ b/processing/text/processor.go
@@ -24,6 +24,7 @@ var (
 	_ Processor = new(processor)
 	_ Processor = new(multiProcessor)
 	_ Processor = RegexpRemover{}
+	_ Processor = RegexpExtractor{}
 	_ Processor = Cutter{}
 	_ Processor = Trimmer{}
 	_ Processor = LineToParagraph{}
@@ -107,6 +108,23 @@ func (p RegexpRemover) Process(s string) (string, error) {
 	return p.Re.ReplaceAllString(s, ""), nil
 }
 
+// RegexpExtractor is a Processor that keeps only the leftmost match of Re in its input.
+// An empty result means either that nothing matched or that Re matched the empty string.
+type RegexpExtractor struct {
+	Re *regexp.Regexp // pattern to search for; must be non-nil, since Describe and Process dereference it
+}
+
+// Describe returns the label "RegexpExtractor(<pattern>)", where <pattern> is Re.String() quoted with %q.
+func (p RegexpExtractor) Describe() string { return fmt.Sprintf("RegexpExtractor(%q)", p.Re.String()) }
+
+// Once always reports true: extraction is a transformative step that is usually applied a single time.
+func (RegexpExtractor) Once() bool { return true }
+
+// Process returns the leftmost match of Re in s, or "" if there is none; the error is always nil.
+func (p RegexpExtractor) Process(s string) (string, error) {
+	return p.Re.FindString(s), nil
+}
+
 // Cutter splits the input by the given separator and keeps only the part before it.
 type Cutter struct {
 	Sep string
diff --git a/processing/text/processor_test.go b/processing/text/processor_test.go
index 9986da1..b90c2a7 100644
--- a/processing/text/processor_test.go
+++ b/processing/text/processor_test.go
@@ -1,6 +1,7 @@
 package text
 
 import (
+	"fmt"
 	"regexp"
 	"testing"
 )
@@ -23,6 +24,67 @@ func TestRegexpRemover(t *testing.T) {
 	}
 }
 
+func TestRegexpExtractor(t *testing.T) {
+	tests := []struct {
+		name    string // subtest name passed to t.Run
+		pattern string // regular expression source, compiled inside each subtest
+		input   string // text passed to Process
+		want    string // expected Process result; "" when nothing matches
+	}{
+		{
+			name:    "Extract first number",
+			pattern: `\d+`,
+			input:   "Order ID: 12345, Count: 67",
+			want:    "12345", // only the leftmost run of digits; "67" is not returned
+		},
+		{
+			name:    "Extract email",
+			pattern: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`,
+			input:   "Please contact support@example.com for help.",
+			want:    "support@example.com",
+		},
+		{
+			name:    "No match found",
+			pattern: `\d+`,
+			input:   "No numbers here",
+			want:    "",
+		},
+		{
+			name:    "Empty input",
+			pattern: `\d+`,
+			input:   "",
+			want:    "",
+		},
+		{
+			name:    "Match start of string",
+			pattern: `^Hello`,
+			input:   "Hello World",
+			want:    "Hello",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			re := regexp.MustCompile(tt.pattern)
+			p := RegexpExtractor{Re: re}
+			expectedDesc := "RegexpExtractor(" + fmt.Sprintf("%q", re.String()) + ")" // same shape as Describe's format string
+			if desc := p.Describe(); desc != expectedDesc {
+				t.Errorf("Describe() = %v, want %v", desc, expectedDesc)
+			}
+			if !p.Once() {
+				t.Error("Once() = false, want true")
+			}
+			got, err := p.Process(tt.input)
+			if err != nil {
+				t.Errorf("Process() error = %v, wantErr %v", err, nil)
+				return // got is not compared once Process has reported an error
+			}
+			if got != tt.want {
+				t.Errorf("Process() = %q, want %q", got, tt.want)
+			}
+		})
+	}
+}
+
 func TestCutter(t *testing.T) {
 	for i, testcase := range []struct {
 		seq      string