@@ -18,7 +18,10 @@ import (
1818 "github.com/ukeeper/ukeeper-readability/datastore"
1919)
2020
21- const defaultMaxGPTIter = 3
21+ const (
22+ defaultMaxGPTIter = 3 // default iteration cap; presumably the fallback of maxGPTIter when MaxGPTIter is not positive (fallback branch not visible here)
23+ aiEvaluatorUser = "ai-evaluator" // stored in rule.User when evaluateAndImprove saves an AI-suggested rule
24+ )
2225
2326//go:generate moq -out mocks/rules.go -pkg mocks -skip-ensure -fmt goimports . Rules
2427
@@ -173,7 +176,6 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
173176 return rb , nil
174177}
175178
176- // maxGPTIter returns MaxGPTIter or the default if not set
177179func (f * UReadability ) maxGPTIter () int {
178180 if f .MaxGPTIter > 0 {
179181 return f .MaxGPTIter
@@ -220,29 +222,31 @@ func (f *UReadability) evaluateAndImprove(ctx context.Context, reqURL, htmlBody
220222 continue
221223 }
222224
223- // rebuild the response with new content
225+ // rebuild the response with new content (defer link normalisation and image extraction to after the loop)
224226 improved := * best
225227 improved .Content = f .getText (rawHTML , best .Title )
226228 improved .Rich = rawHTML
227229 improved .Excerpt = f .getSnippet (improved .Content )
228230
229- // normalize links and extract images from the new content
231+ best = & improved
232+ bestSelector = eval .Selector
233+ }
234+
235+ // post-process the final result: normalise links and extract images once
236+ if bestSelector != "" {
230237 finalURL , err := url .Parse (best .URL )
231238 if err != nil {
232239 log .Printf ("[WARN] failed to parse URL %q in evaluateAndImprove: %v" , best .URL , err )
233240 return best
234241 }
235- improved .Rich , improved .AllLinks = f .normalizeLinks (improved .Rich , finalURL )
236- darticle , err := goquery .NewDocumentFromReader (strings .NewReader (improved .Rich ))
242+ best .Rich , best .AllLinks = f .normalizeLinks (best .Rich , finalURL )
243+ darticle , err := goquery .NewDocumentFromReader (strings .NewReader (best .Rich ))
237244 if err == nil {
238245 if im , allImages , ok := f .extractPics (darticle .Find ("img" ), reqURL ); ok {
239- improved .Image = im
240- improved .AllImages = allImages
246+ best .Image = im
247+ best .AllImages = allImages
241248 }
242249 }
243-
244- best = & improved
245- bestSelector = eval .Selector
246250 }
247251
248252 // save rule if we found a better selector
@@ -254,7 +258,7 @@ func (f *UReadability) evaluateAndImprove(ctx context.Context, reqURL, htmlBody
254258 }
255259 rule .Content = bestSelector
256260 rule .Enabled = true
257- rule .User = "ai-evaluator"
261+ rule .User = aiEvaluatorUser
258262 if _ , err := f .Rules .Save (ctx , rule ); err != nil {
259263 log .Printf ("[WARN] failed to save AI-suggested rule for %s: %v" , best .Domain , err )
260264 } else {
@@ -298,16 +302,10 @@ func (f *UReadability) getContent(ctx context.Context, body, reqURL string, rule
298302 // custom rules parser
299303 customParser := func (body , reqURL string , rule datastore.Rule ) (content , rich string , err error ) {
300304 log .Printf ("[DEBUG] custom extractor for %s" , reqURL )
301- dbody , err := goquery . NewDocumentFromReader ( strings . NewReader ( body ) )
305+ res , err := f . extractWithSelector ( body , rule . Content )
302306 if err != nil {
303307 return "" , "" , err
304308 }
305- var res string
306- dbody .Find (rule .Content ).Each (func (_ int , s * goquery.Selection ) {
307- if html , err := s .Html (); err == nil {
308- res += html
309- }
310- })
311309 if res == "" {
312310 return "" , "" , fmt .Errorf ("nothing extracted from %s, rule=%v" , reqURL , rule )
313311 }
0 commit comments