Skip to content

Commit ddc9c7b

Browse files
committed
fix: deduplicate extraction logic, remove dead code, fix doc comments
- customParser now delegates to extractWithSelector (eliminates duplicated goquery parse+find+html loop)
- image extraction moved out of evaluation loop — runs once on final result instead of every iteration
- extract "ai-evaluator" to aiEvaluatorUser constant
- fix incorrect doc comment on callAPI
- remove unused getAuth test helper
- remove redundant cancel() call and restating comments
1 parent 4df81fa commit ddc9c7b

3 files changed

Lines changed: 18 additions & 26 deletions

File tree

extractor/evaluator.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ type OpenAIEvaluator struct {
4545
client *openai.Client
4646
}
4747

48-
// getClient returns the OpenAI client, creating it once on first use
4948
func (e *OpenAIEvaluator) getClient() *openai.Client {
5049
e.clientOnce.Do(func() {
5150
if e.clientConfig != nil {
@@ -71,7 +70,6 @@ func (e *OpenAIEvaluator) Evaluate(ctx context.Context, reqURL, extractedText, h
7170
if !errors.Is(err, errInvalidJSON) {
7271
return nil, err
7372
}
74-
cancel() // release the first context before creating a new one
7573

7674
// retry once on invalid JSON with a fresh timeout
7775
log.Printf("[WARN] invalid JSON from OpenAI for %s, retrying once", reqURL)
@@ -87,7 +85,7 @@ func (e *OpenAIEvaluator) Evaluate(ctx context.Context, reqURL, extractedText, h
8785
}
8886

8987
// callAPI makes a single API call and parses the response JSON.
90-
// Returns nil EvalResult (without error) if the response is not valid JSON.
88+
// returns errInvalidJSON if the response is not valid JSON.
9189
func (e *OpenAIEvaluator) callAPI(ctx context.Context, client *openai.Client, userPrompt string) (*EvalResult, error) {
9290
resp, err := client.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
9391
Model: e.Model,

extractor/readability.go

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@ import (
1818
"github.com/ukeeper/ukeeper-readability/datastore"
1919
)
2020

21-
const defaultMaxGPTIter = 3
21+
const (
22+
defaultMaxGPTIter = 3
23+
aiEvaluatorUser = "ai-evaluator"
24+
)
2225

2326
//go:generate moq -out mocks/rules.go -pkg mocks -skip-ensure -fmt goimports . Rules
2427

@@ -173,7 +176,6 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
173176
return rb, nil
174177
}
175178

176-
// maxGPTIter returns MaxGPTIter or the default if not set
177179
func (f *UReadability) maxGPTIter() int {
178180
if f.MaxGPTIter > 0 {
179181
return f.MaxGPTIter
@@ -220,29 +222,31 @@ func (f *UReadability) evaluateAndImprove(ctx context.Context, reqURL, htmlBody
220222
continue
221223
}
222224

223-
// rebuild the response with new content
225+
// rebuild the response with new content (defer link normalisation and image extraction to after the loop)
224226
improved := *best
225227
improved.Content = f.getText(rawHTML, best.Title)
226228
improved.Rich = rawHTML
227229
improved.Excerpt = f.getSnippet(improved.Content)
228230

229-
// normalize links and extract images from the new content
231+
best = &improved
232+
bestSelector = eval.Selector
233+
}
234+
235+
// post-process the final result: normalise links and extract images once
236+
if bestSelector != "" {
230237
finalURL, err := url.Parse(best.URL)
231238
if err != nil {
232239
log.Printf("[WARN] failed to parse URL %q in evaluateAndImprove: %v", best.URL, err)
233240
return best
234241
}
235-
improved.Rich, improved.AllLinks = f.normalizeLinks(improved.Rich, finalURL)
236-
darticle, err := goquery.NewDocumentFromReader(strings.NewReader(improved.Rich))
242+
best.Rich, best.AllLinks = f.normalizeLinks(best.Rich, finalURL)
243+
darticle, err := goquery.NewDocumentFromReader(strings.NewReader(best.Rich))
237244
if err == nil {
238245
if im, allImages, ok := f.extractPics(darticle.Find("img"), reqURL); ok {
239-
improved.Image = im
240-
improved.AllImages = allImages
246+
best.Image = im
247+
best.AllImages = allImages
241248
}
242249
}
243-
244-
best = &improved
245-
bestSelector = eval.Selector
246250
}
247251

248252
// save rule if we found a better selector
@@ -254,7 +258,7 @@ func (f *UReadability) evaluateAndImprove(ctx context.Context, reqURL, htmlBody
254258
}
255259
rule.Content = bestSelector
256260
rule.Enabled = true
257-
rule.User = "ai-evaluator"
261+
rule.User = aiEvaluatorUser
258262
if _, err := f.Rules.Save(ctx, rule); err != nil {
259263
log.Printf("[WARN] failed to save AI-suggested rule for %s: %v", best.Domain, err)
260264
} else {
@@ -298,16 +302,10 @@ func (f *UReadability) getContent(ctx context.Context, body, reqURL string, rule
298302
// custom rules parser
299303
customParser := func(body, reqURL string, rule datastore.Rule) (content, rich string, err error) {
300304
log.Printf("[DEBUG] custom extractor for %s", reqURL)
301-
dbody, err := goquery.NewDocumentFromReader(strings.NewReader(body))
305+
res, err := f.extractWithSelector(body, rule.Content)
302306
if err != nil {
303307
return "", "", err
304308
}
305-
var res string
306-
dbody.Find(rule.Content).Each(func(_ int, s *goquery.Selection) {
307-
if html, err := s.Html(); err == nil {
308-
res += html
309-
}
310-
})
311309
if res == "" {
312310
return "", "", fmt.Errorf("nothing extracted from %s, rule=%v", reqURL, rule)
313311
}

rest/server_test.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -657,10 +657,6 @@ func TestServer_ContentParsedWrong(t *testing.T) {
657657
})
658658
}
659659

660-
func getAuth(t *testing.T, url string) (response string, statusCode int) {
661-
return doAuth(t, "GET", url)
662-
}
663-
664660
func postAuth(t *testing.T, url string) (response string, statusCode int) {
665661
return doAuth(t, "POST", url)
666662
}

0 commit comments

Comments
 (0)