@@ -21,6 +21,7 @@ import (
2121 "github.com/good-native/hover/internal/util"
2222 "github.com/google/uuid"
2323 "github.com/lib/pq"
24+ "golang.org/x/sync/singleflight"
2425)
2526
2627var jobsLog = logging .Component ("jobs" )
@@ -100,12 +101,39 @@ type JobManager struct {
100101 pagesMutex sync.RWMutex
101102
102103 sitemapSem chan struct {} // caps concurrent sitemap batch insertions across all jobs
104+
105+ // In-memory cache for robots.txt rules. Previously every job-info
106+ // cache miss in the stream worker (5 min TTL) refetched /robots.txt;
107+ // a long crawl hammered the origin and produced repeat 429s. Negative
108+ // entries are kept too so a 429 on /robots.txt doesn't get re-hit
109+ // every Release. Single-flight collapses concurrent misses for the
110+ // same domain into one origin fetch.
111+ robotsCache map [string ]robotsCacheEntry
112+ robotsMutex sync.RWMutex
113+ robotsGroup singleflight.Group
114+ robotsTTLPos time.Duration // success TTL
115+ robotsTTLNeg time.Duration // failure / nil-rules TTL
116+ }
117+
118+ type robotsCacheEntry struct {
119+ rules * crawler.RobotsRules
120+ err error
121+ expiresAt time.Time
103122}
104123
105124type ProgressMilestoneCallback func (ctx context.Context , jobID string , oldPct , newPct int )
106125
107126type JobTerminatedCallback func (ctx context.Context , jobID string )
108127
128+ // Defaults sized to swallow the stream worker's 5-minute job-info refresh
129+ // without re-hitting the origin: 1h success TTL means a long crawl fetches
130+ // /robots.txt at most a handful of times. Negative TTL is short so a
131+ // transient 429 on /robots.txt unblocks within ~60s.
132+ const (
133+ defaultRobotsTTLPositive = time .Hour
134+ defaultRobotsTTLNegative = 60 * time .Second
135+ )
136+
109137func NewJobManager (db * sql.DB , dbQueue DbQueueProvider , crawler CrawlerInterface ) * JobManager {
110138 sitemapConcurrency := sitemapInsertConcurrency ()
111139 return & JobManager {
@@ -115,6 +143,9 @@ func NewJobManager(db *sql.DB, dbQueue DbQueueProvider, crawler CrawlerInterface
115143 processedPages : make (map [string ]struct {}),
116144 lastMilestoneFired : make (map [string ]int ),
117145 sitemapSem : make (chan struct {}, sitemapConcurrency ),
146+ robotsCache : make (map [string ]robotsCacheEntry ),
147+ robotsTTLPos : defaultRobotsTTLPositive ,
148+ robotsTTLNeg : defaultRobotsTTLNegative ,
118149 }
119150}
120151
@@ -372,19 +403,74 @@ func (jm *JobManager) setupJobDatabase(ctx context.Context, job *Job, normalised
372403const wafCacheTTL = 24 * time .Hour
373404
374405// Returns nil (no error) when the crawler is unavailable; callers treat
375- // nil rules as "no restriction".
406+ // nil rules as "no restriction". Results are cached per normalised domain:
407+ // successful fetches for robotsTTLPos, errors for robotsTTLNeg. Concurrent
408+ // misses for the same domain collapse onto a single origin fetch via
409+ // singleflight so a job-info burst can't trigger N parallel /robots.txt
410+ // requests.
376411func (jm * JobManager ) GetRobotsRules (ctx context.Context , domain string ) (* crawler.RobotsRules , error ) {
377412 if jm .crawler == nil {
378413 return nil , nil
379414 }
380- result , err := jm .crawler .DiscoverSitemapsAndRobots (ctx , domain )
381- if err != nil {
382- return nil , fmt .Errorf ("jobs: fetch robots for %s: %w" , domain , err )
383- }
384- if result == nil {
385- return nil , nil
386- }
387- return result .RobotsRules , nil
415+
416+ // Lowercase first: util.NormaliseDomain uses case-sensitive
417+ // TrimPrefix for the scheme/www strips, so an upper-case "HTTPS://"
418+ // would otherwise survive and split the cache.
419+ key := util .NormaliseDomain (strings .ToLower (domain ))
420+ now := time .Now ()
421+
422+ jm .robotsMutex .RLock ()
423+ if entry , ok := jm .robotsCache [key ]; ok && now .Before (entry .expiresAt ) {
424+ jm .robotsMutex .RUnlock ()
425+ return entry .rules , entry .err
426+ }
427+ jm .robotsMutex .RUnlock ()
428+
429+ type robotsResult struct {
430+ rules * crawler.RobotsRules
431+ err error
432+ }
433+ val , _ , _ := jm .robotsGroup .Do (key , func () (any , error ) {
434+ // Fetch with the canonical key so the request and the cache
435+ // entry agree on the origin string. Without this, a scheme- or
436+ // case-variant input would fetch under the raw form while the
437+ // failure outcome got stored against the normalised key.
438+ result , fetchErr := jm .crawler .DiscoverSitemapsAndRobots (ctx , key )
439+ out := robotsResult {}
440+ if fetchErr != nil {
441+ out .err = fmt .Errorf ("jobs: fetch robots for %s: %w" , key , fetchErr )
442+ } else if result != nil {
443+ out .rules = result .RobotsRules
444+ }
445+
446+ // Don't negative-cache caller-side cancellations: one timed-out
447+ // caller would otherwise poison the shared cache for everyone
448+ // else for `robotsTTLNeg`.
449+ transient := out .err != nil && (errors .Is (fetchErr , context .Canceled ) ||
450+ errors .Is (fetchErr , context .DeadlineExceeded ))
451+
452+ if ! transient {
453+ ttl := jm .robotsTTLPos
454+ if out .err != nil {
455+ ttl = jm .robotsTTLNeg
456+ }
457+ jm .robotsMutex .Lock ()
458+ jm .robotsCache [key ] = robotsCacheEntry {
459+ rules : out .rules ,
460+ err : out .err ,
461+ expiresAt : time .Now ().Add (ttl ),
462+ }
463+ jm .robotsMutex .Unlock ()
464+ }
465+
466+ // singleflight.Do returns the inner error to all sharers; return
467+ // nil here and propagate err via the typed payload so a context
468+ // cancellation in one caller doesn't poison the cached failure.
469+ return out , nil
470+ })
471+
472+ res := val .(robotsResult )
473+ return res .rules , res .err
388474}
389475
390476func (jm * JobManager ) validateRootURLAccess (ctx context.Context , job * Job , normalisedDomain string , rootPath string ) (* crawler.RobotsRules , error ) {
0 commit comments