Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,19 @@ On merge, CI will:

## [Unreleased]

_Add unreleased changes here._
### Fixed

- WAF circuit breaker no longer trips on recoverable Cloudflare `Cf-Mitigated`
values (`challenge`, `jschallenge`, `managed_challenge`, `rate_limited`); the
403/429 status code still drives pacer back-off. Only `block` (and unknown
values) trips the breaker.

### Added

- Pacer warm-up floor: never-crawled domains seed `adaptive_delay_ms` to
`GNH_PACER_WARMUP_DELAY_MS` (default 2000; set to 0 to disable) instead of 0,
so the per-domain inflight cap is active from the first dispatch. The delay
steps down via the existing success path.

## Full changelog history

Expand Down
22 changes: 15 additions & 7 deletions internal/crawler/waf.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ const (
WAFVendorGeneric = "generic"
)

// cfRecoverableMitigations is the set of Cf-Mitigated header values that
// DetectWAF treats as recoverable Cloudflare actions. A response carrying
// one of these values is not reported as blocked, so it does not trip the
// circuit breaker; the 403/429 pacer path already handles such responses
// without terminating the job. Only "block" (and unknown values) trip the
// breaker.
//
// Keys must be lowercase: DetectWAF trims and lowercases the header value
// before looking it up here.
var cfRecoverableMitigations = map[string]struct{}{
"challenge": {},
"jschallenge": {},
"managed_challenge": {},
"rate_limited": {},
}

// DetectWAF inspects a response and reports whether it carries a
// fingerprint of a known bot-protection layer. The function is pure: no
// I/O, safe for table-driven tests. It is intentionally conservative on
Expand All @@ -52,16 +62,14 @@ func DetectWAF(statusCode int, headers http.Header, bodySample []byte) WAFDetect

blocking := isBlockingStatus(statusCode)

// cf-mitigated indicates Cloudflare took bot-management action
// (challenge, block, jschallenge, managed_challenge). It is only
// emitted on responses where CF intervened, so a non-empty value on
// any non-200 status is a reliable signal — including 429 challenges
// against datacentre egress IPs.
if v := strings.TrimSpace(headers.Get("Cf-Mitigated")); v != "" && statusCode != http.StatusOK {
if v := strings.ToLower(strings.TrimSpace(headers.Get("Cf-Mitigated"))); v != "" && statusCode != http.StatusOK {
if _, recoverable := cfRecoverableMitigations[v]; recoverable {
return WAFDetection{}
}
return WAFDetection{
Blocked: true,
Vendor: WAFVendorCloudflare,
Reason: "cf-mitigated header present on " + statusLabel(statusCode),
Reason: "cf-mitigated=" + v + " on " + statusLabel(statusCode),
}
}

Expand Down
82 changes: 68 additions & 14 deletions internal/crawler/waf_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,34 +111,88 @@ func TestDetectWAF(t *testing.T) {
reasonPrefix: "tiny body",
},
{
name: "cloudflare — cf-mitigated header on 403",
name: "cloudflare — cf-mitigated=challenge on 403 is recoverable",
status: http.StatusForbidden,
headers: http.Header{
"Cf-Mitigated": []string{"challenge"},
"Server": []string{"cloudflare"},
},
body: []byte("Just a moment..."),
wantBlocked: true,
wantVendor: WAFVendorCloudflare,
reasonPrefix: "cf-mitigated",
body: []byte("Just a moment..."),
wantBlocked: false,
},
{
// Real failure mode observed against Shopify storefronts when
// CF "Super Bot Fight Mode" is enabled: CF returns 429 with
// the challenge HTML in the body and Cf-Mitigated: challenge.
// Prior to this case, isBlockingStatus only allowed 403/202
// so the detector silently no-op'd and jobs burnt 3 retries
// before failing with a misleading "Too Many Requests" error.
name: "cloudflare — cf-mitigated challenge on 429",
name: "cloudflare — cf-mitigated=challenge on 429 is recoverable",
status: http.StatusTooManyRequests,
headers: http.Header{
"Cf-Mitigated": []string{"challenge"},
"Server": []string{"cloudflare"},
},
body: []byte(strings.Repeat("x", 9000)), // CF challenge page is ~9KB
body: []byte(strings.Repeat("x", 9000)),
wantBlocked: false,
},
{
name: "cloudflare — cf-mitigated=managed_challenge on 403 is recoverable",
status: http.StatusForbidden,
headers: http.Header{
"Cf-Mitigated": []string{"managed_challenge"},
"Server": []string{"cloudflare"},
},
body: []byte("checking your browser"),
wantBlocked: false,
},
{
name: "cloudflare — cf-mitigated=jschallenge on 403 is recoverable",
status: http.StatusForbidden,
headers: http.Header{
"Cf-Mitigated": []string{"jschallenge"},
"Server": []string{"cloudflare"},
},
body: []byte("challenge page"),
wantBlocked: false,
},
{
name: "cloudflare — cf-mitigated=rate_limited on 429 is recoverable",
status: http.StatusTooManyRequests,
headers: http.Header{
"Cf-Mitigated": []string{"rate_limited"},
"Server": []string{"cloudflare"},
},
body: []byte("rate limited"),
wantBlocked: false,
},
{
name: "cloudflare — cf-mitigated normalisation (case/space) is recoverable",
status: http.StatusForbidden,
headers: http.Header{
"Cf-Mitigated": []string{" Managed_Challenge "},
"Server": []string{"cloudflare"},
},
body: []byte("checking your browser"),
wantBlocked: false,
},
{
name: "cloudflare — cf-mitigated=block on 403 is a hard block",
status: http.StatusForbidden,
headers: http.Header{
"Cf-Mitigated": []string{"block"},
"Server": []string{"cloudflare"},
},
body: []byte("Access denied"),
wantBlocked: true,
wantVendor: WAFVendorCloudflare,
reasonPrefix: "cf-mitigated=block on 403",
},
{
name: "cloudflare — cf-mitigated=BLOCK uppercase still trips",
status: http.StatusForbidden,
headers: http.Header{
"Cf-Mitigated": []string{"BLOCK"},
"Server": []string{"cloudflare"},
},
body: []byte("Access denied"),
wantBlocked: true,
wantVendor: WAFVendorCloudflare,
reasonPrefix: "cf-mitigated header present on 429",
reasonPrefix: "cf-mitigated=block on 403",
},
{
name: "cloudflare — cf-mitigated alone on 200 must NOT trip (caching path)",
Expand Down
19 changes: 19 additions & 0 deletions internal/jobs/stream_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,19 @@ func DefaultStreamWorkerOpts() StreamWorkerOpts {

// jobInfoTTL is a fixed five-minute duration. Presumably it is the lifetime
// of cachedJobInfo entries (their expiresAt field) — TODO confirm against
// the cache code.
const jobInfoTTL = 5 * time.Minute

// defaultPacerWarmupDelayMS is the warm-up delay, in milliseconds, used when
// GNH_PACER_WARMUP_DELAY_MS is unset or cannot be used.
const defaultPacerWarmupDelayMS = 2000

// pacerWarmupDelayMS returns the pacer warm-up delay in milliseconds.
//
// The value is read from the GNH_PACER_WARMUP_DELAY_MS environment variable
// on every call. Surrounding whitespace is ignored. An unset or blank
// variable yields defaultPacerWarmupDelayMS silently. A value that is not a
// base-10 integer, or that is negative, is logged as a warning and replaced
// by the default. Zero is accepted and returned as-is.
func pacerWarmupDelayMS() int {
	raw := strings.TrimSpace(os.Getenv("GNH_PACER_WARMUP_DELAY_MS"))
	if raw == "" {
		// Nothing configured: fall back without logging.
		return defaultPacerWarmupDelayMS
	}

	parsed, parseErr := strconv.Atoi(raw)
	if parseErr != nil || parsed < 0 {
		jobsLog.Warn("invalid GNH_PACER_WARMUP_DELAY_MS; using default",
			"value", raw, "default", defaultPacerWarmupDelayMS)
		return defaultPacerWarmupDelayMS
	}
	return parsed
}

type cachedJobInfo struct {
info *JobInfo
expiresAt time.Time
Expand Down Expand Up @@ -759,6 +772,12 @@ func (swp *StreamWorkerPool) fetchJobInfo(ctx context.Context, jobID string) (*J
baseDelayMS := info.CrawlDelay * 1000
adaptiveDelayMS := info.AdaptiveDelay * 1000
floorMS := info.AdaptiveDelayFloor * 1000
// NULL means never crawled; a stored 0 is a learned value we trust.
if !adaptiveDelay.Valid {
if warmup := pacerWarmupDelayMS(); warmup > 0 {
adaptiveDelayMS = warmup
}
}
if seedErr := swp.pacer.Seed(ctx, info.DomainName, baseDelayMS, adaptiveDelayMS, floorMS); seedErr != nil {
jobsLog.Warn("pacer seed from postgres failed, continuing",
"error", seedErr, "domain", info.DomainName)
Expand Down
Loading