Skip to content

Commit b07ebb6

Browse files
Merge pull request #385 from Good-Native/work/robots-cache
Cache robots.txt rules per domain
2 parents b049d70 + 8c96105 commit b07ebb6

3 files changed

Lines changed: 356 additions & 10 deletions

File tree

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,13 @@ On merge, CI will:
2828

2929
## [Unreleased]
3030

31-
_Add unreleased changes here._
31+
### Changed
32+
33+
- `JobManager.GetRobotsRules` now caches results per normalised domain (1h
34+
positive TTL, 60s negative TTL), and collapses concurrent misses onto a single
35+
origin fetch via singleflight. A long crawl previously refetched `/robots.txt`
36+
every five minutes (stream worker's job-info TTL) and a 429 on `/robots.txt`
37+
returned on the next read; both are now bounded.
3238

3339
## Full changelog history
3440

internal/jobs/manager.go

Lines changed: 95 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/good-native/hover/internal/util"
2222
"github.com/google/uuid"
2323
"github.com/lib/pq"
24+
"golang.org/x/sync/singleflight"
2425
)
2526

2627
var jobsLog = logging.Component("jobs")
@@ -100,12 +101,39 @@ type JobManager struct {
100101
pagesMutex sync.RWMutex
101102

102103
sitemapSem chan struct{} // caps concurrent sitemap batch insertions across all jobs
104+
105+
// In-memory cache for robots.txt rules. Previously every job-info
106+
// cache miss in the stream worker (5 min TTL) refetched /robots.txt;
107+
// a long crawl hammered the origin and produced repeat 429s. Negative
108+
// entries are kept too so a 429 on /robots.txt doesn't get re-hit
109+
// every Release. Single-flight collapses concurrent misses for the
110+
// same domain into one origin fetch.
111+
robotsCache map[string]robotsCacheEntry
112+
robotsMutex sync.RWMutex
113+
robotsGroup singleflight.Group
114+
robotsTTLPos time.Duration // success TTL
115+
robotsTTLNeg time.Duration // failure / nil-rules TTL
116+
}
117+
118+
type robotsCacheEntry struct {
119+
rules *crawler.RobotsRules
120+
err error
121+
expiresAt time.Time
103122
}
104123

105124
type ProgressMilestoneCallback func(ctx context.Context, jobID string, oldPct, newPct int)
106125

107126
type JobTerminatedCallback func(ctx context.Context, jobID string)
108127

128+
// Defaults sized to swallow the stream worker's 5-minute job-info refresh
129+
// without re-hitting the origin: 1h success TTL means a long crawl fetches
130+
// /robots.txt at most a handful of times. Negative TTL is short so a
131+
// transient 429 on /robots.txt unblocks within ~60s.
132+
const (
133+
defaultRobotsTTLPositive = time.Hour
134+
defaultRobotsTTLNegative = 60 * time.Second
135+
)
136+
109137
func NewJobManager(db *sql.DB, dbQueue DbQueueProvider, crawler CrawlerInterface) *JobManager {
110138
sitemapConcurrency := sitemapInsertConcurrency()
111139
return &JobManager{
@@ -115,6 +143,9 @@ func NewJobManager(db *sql.DB, dbQueue DbQueueProvider, crawler CrawlerInterface
115143
processedPages: make(map[string]struct{}),
116144
lastMilestoneFired: make(map[string]int),
117145
sitemapSem: make(chan struct{}, sitemapConcurrency),
146+
robotsCache: make(map[string]robotsCacheEntry),
147+
robotsTTLPos: defaultRobotsTTLPositive,
148+
robotsTTLNeg: defaultRobotsTTLNegative,
118149
}
119150
}
120151

@@ -372,19 +403,74 @@ func (jm *JobManager) setupJobDatabase(ctx context.Context, job *Job, normalised
372403
const wafCacheTTL = 24 * time.Hour
373404

374405
// Returns nil (no error) when the crawler is unavailable; callers treat
375-
// nil rules as "no restriction".
406+
// nil rules as "no restriction". Results are cached per normalised domain:
407+
// successful fetches for robotsTTLPos, errors for robotsTTLNeg. Concurrent
408+
// misses for the same domain collapse onto a single origin fetch via
409+
// singleflight so a job-info burst can't trigger N parallel /robots.txt
410+
// requests.
376411
func (jm *JobManager) GetRobotsRules(ctx context.Context, domain string) (*crawler.RobotsRules, error) {
377412
if jm.crawler == nil {
378413
return nil, nil
379414
}
380-
result, err := jm.crawler.DiscoverSitemapsAndRobots(ctx, domain)
381-
if err != nil {
382-
return nil, fmt.Errorf("jobs: fetch robots for %s: %w", domain, err)
383-
}
384-
if result == nil {
385-
return nil, nil
386-
}
387-
return result.RobotsRules, nil
415+
416+
// Lowercase first: util.NormaliseDomain uses case-sensitive
417+
// TrimPrefix for the scheme/www strips, so an upper-case "HTTPS://"
418+
// would otherwise survive and split the cache.
419+
key := util.NormaliseDomain(strings.ToLower(domain))
420+
now := time.Now()
421+
422+
jm.robotsMutex.RLock()
423+
if entry, ok := jm.robotsCache[key]; ok && now.Before(entry.expiresAt) {
424+
jm.robotsMutex.RUnlock()
425+
return entry.rules, entry.err
426+
}
427+
jm.robotsMutex.RUnlock()
428+
429+
type robotsResult struct {
430+
rules *crawler.RobotsRules
431+
err error
432+
}
433+
val, _, _ := jm.robotsGroup.Do(key, func() (any, error) {
434+
// Fetch with the canonical key so the request and the cache
435+
// entry agree on the origin string. Without this, a scheme- or
436+
// case-variant input would fetch under the raw form while the
437+
// failure outcome got stored against the normalised key.
438+
result, fetchErr := jm.crawler.DiscoverSitemapsAndRobots(ctx, key)
439+
out := robotsResult{}
440+
if fetchErr != nil {
441+
out.err = fmt.Errorf("jobs: fetch robots for %s: %w", key, fetchErr)
442+
} else if result != nil {
443+
out.rules = result.RobotsRules
444+
}
445+
446+
// Don't negative-cache caller-side cancellations: one timed-out
447+
// caller would otherwise poison the shared cache for everyone
448+
// else for `robotsTTLNeg`.
449+
transient := out.err != nil && (errors.Is(fetchErr, context.Canceled) ||
450+
errors.Is(fetchErr, context.DeadlineExceeded))
451+
452+
if !transient {
453+
ttl := jm.robotsTTLPos
454+
if out.err != nil {
455+
ttl = jm.robotsTTLNeg
456+
}
457+
jm.robotsMutex.Lock()
458+
jm.robotsCache[key] = robotsCacheEntry{
459+
rules: out.rules,
460+
err: out.err,
461+
expiresAt: time.Now().Add(ttl),
462+
}
463+
jm.robotsMutex.Unlock()
464+
}
465+
466+
// singleflight.Do returns the inner error to all sharers; return
467+
// nil here and propagate err via the typed payload so a context
468+
// cancellation in one caller doesn't poison the cached failure.
469+
return out, nil
470+
})
471+
472+
res := val.(robotsResult)
473+
return res.rules, res.err
388474
}
389475

390476
func (jm *JobManager) validateRootURLAccess(ctx context.Context, job *Job, normalisedDomain string, rootPath string) (*crawler.RobotsRules, error) {

0 commit comments

Comments
 (0)