# Source: hover/fly.worker.toml at main · Good-Native/hover · GitHub
# fly.worker.toml — Fly.io app definition for the hover worker service.
# The worker pulls tasks off Redis Streams, runs the crawls, and writes
# the results to Postgres.

app = "hover-worker"
primary_region = "syd"

# Empty table: this file sets no build options.
[build]

# Environment variables handed to the worker process. Every value is a
# string, including the numeric tunables; durations carry their unit in the
# key suffix (_MS = milliseconds, _S = seconds).
[env]
  APP_ENV = "production"
  WORKER_COUNT = "30"
  WORKER_CONCURRENCY = "20"
  GNH_MAX_WORKERS = "130"
  # Admission breadth for the jobs query that feeds the dispatcher's
  # ActiveJobIDs list. Pre-merge formula: max(GNH_MAX_WORKERS × factor, min).
  # At 130 × 3 = 390 this keeps the dispatcher iterating over enough jobs
  # that a single slow/blocked job can't starve the others.
  GNH_PENDING_ADMISSION_LIMIT_MIN = "250"
  GNH_PENDING_ADMISSION_WORKER_FACTOR = "3"
  # Pre-merge (2de8edec) ran a single app with 125 open / 30 idle and Grafana
  # peaked at 179 pgbouncer / 93 postgres connections during the 4k tasks/min
  # run. The worker now owns all crawl execution, so it keeps the bigger pool;
  # the API (fly.toml) stays lean at 60 because it only serves HTTP handlers.
  DB_MAX_OPEN_CONNS = "125"
  DB_MAX_IDLE_CONNS = "30"
  # Without these the worker defaults to DB_QUEUE_MAX_CONCURRENCY=12 (queue.go)
  # and DB_TX_MAX_RETRIES=3 — throttling 130 workers down to 12 concurrent DB
  # operations and giving up earlier on transient serialisation failures than
  # the API does. Pre-merge these ran inside the API at 100 / 5.
  DB_QUEUE_MAX_CONCURRENCY = "100"
  DB_TX_MAX_RETRIES = "5"
  GNH_PRESSURE_HIGH_MARK_MS = "80"
  GNH_PRESSURE_LOW_MARK_MS = "40"
  # GNH_PRESSURE_INITIAL_LIMIT intentionally unset — pressure.go defaults it to
  # DB_QUEUE_MAX_CONCURRENCY (safeMax). Setting it explicitly invites drift and
  # the "exceeds queue cap" clamp warning if the two values get out of sync.
  GNH_PRESSURE_MIN_LIMIT = "30"
  GNH_PRESSURE_STEP_DOWN = "5"
  REDIS_POOL_SIZE = "200"
  REDIS_DISPATCH_INTERVAL_MS = "100"
  # Up from 50 → 100. Dispatcher sweeps due tasks from each job's ZSET
  # into its Stream at this batch size per tick. With 30 active jobs
  # and a 100ms tick (10 ticks/s), the ceiling was 30 × 50 × 10 = 15k/s
  # (900k/min). Doubling gives 30k/s (1.8M/min) headroom — still bounded
  # by CanDispatch per-job cap.
  REDIS_DISPATCH_BATCH_SIZE = "100"
  REDIS_CONSUMER_BLOCK_MS = "2000"
  # Messages fetched per XREADGROUP call. Default 10 (code default) avoids
  # the Count=1 thundering-herd of a Redis round-trip per message; the
  # worker's own sem still caps in-flight parallelism.
  REDIS_CONSUMER_READ_COUNT = "10"
  REDIS_AUTOCLAIM_INTERVAL_S = "30"
  # Per-call XAUTOCLAIM COUNT and per-tick safety cap. With Count=100 and
  # a cap of 1000, a single sweep can drain a stuck PEL without blowing
  # out the tick when one job has thousands of stale messages.
  REDIS_AUTOCLAIM_COUNT = "100"
  REDIS_AUTOCLAIM_MAX_PER_SWEEP = "1000"
  # XAUTOCLAIM min-idle-time (seconds). Messages pending longer than this
  # are eligible for reclaim by another consumer. Matches pre-env default.
  REDIS_AUTOCLAIM_MIN_IDLE_S = "180"
  # Dead-letter a message after this many deliveries. Matches pre-env default.
  REDIS_AUTOCLAIM_MAX_DELIVERIES = "3"
  REDIS_COUNTER_SYNC_INTERVAL_S = "5"
  # How often the worker rebuilds the Redis running-counters HASH from the
  # authoritative XPENDING view. Self-heals any mid-run drift.
  REDIS_COUNTER_RECONCILE_INTERVAL_S = "120"
  # Outbox sweep: moves task_outbox rows into the Redis ZSET. At 200ms ×
  # 500 this caps ingest at 150k/min — well above current peak and keeps
  # newly-discovered siblings from waiting a full tick after a parent
  # completes. The sweep is an index-only SKIP LOCKED query so the extra
  # frequency is negligible.
  OUTBOX_SWEEP_INTERVAL_MS = "200"
  OUTBOX_SWEEP_BATCH_SIZE = "500"
  GNH_BATCH_CHANNEL_SIZE = "5000"
  # Batch size matches pre-merge fly.toml (16). Was bumped to 32
  # post-merge without benchmarking — reverting to the value that ran
  # 4k tasks/min in production.
  GNH_RUNNING_TASK_BATCH_SIZE = "16"
  GNH_RUNNING_TASK_FLUSH_INTERVAL_MS = "50"
  # Matches pre-merge fly.toml. Was lowered to 0.5 post-merge — restoring
  # to the value that ran 4k tasks/min in production.
  GNH_LINK_DISCOVERY_MIN_PRIORITY = "0.7"
  # Crawler concurrency for this worker (pre-merge value on fly.toml). Without
  # this the crawler falls back to MaxConcurrency=10 (internal/crawler/config.go
  # DefaultConfig) — a 10× reduction in per-collector parallelism. The Redis PR
  # left this on fly.toml (API) but the API no longer crawls; the worker does.
  GNH_CRAWLER_MAX_CONCURRENCY = "100"
  # Sitemap ingest pacing (pre-merge defaults from fly.toml). Controls how
  # quickly the sitemap processor batches discovered URLs into the task table,
  # which feeds the dispatcher ZSET. Defaults are lower (3 / 50 / 200).
  GNH_SITEMAP_CONCURRENCY = "5"
  GNH_SITEMAP_BATCH_SIZE = "100"
  GNH_SITEMAP_BATCH_DELAY_MS = "200"
  GNH_RATE_LIMIT_BASE_DELAY_MS = "50"
  # Pacer tuning values match the pre-merge DomainLimiter defaults
  # (internal/jobs/domain_limiter.go in pre-merge tree): step_up=500ms,
  # threshold=5 successes, max=60s, symmetric step-down. Pre-merge ran
  # 4k tasks/min with these defaults untouched — the in-memory limiter
  # reset on every worker restart, which implicitly bounded how long a
  # domain could stay throttled. Post-merge the equivalent reset is the
  # FlushAdaptiveDelays call on worker boot (cmd/worker/main.go); that
  # restores the pre-merge semantics without needing to reinvent the
  # ramp/recovery tuning.
  GNH_RATE_LIMIT_MAX_DELAY_MS = "60000"
  # Floor on DomainPacer RetryAfter when the domain gate is held. Without
  # this the dispatcher re-fetches rate-limited tasks every tick (100ms)
  # and spins on push-back. Pre-merge this was domainDelayPause (100ms).
  GNH_DOMAIN_DELAY_PAUSE_MS = "100"
  # Cap on concurrent ProcessDiscoveredLinks calls. Set to 128 to lift the
  # ~3–3.5k tasks/min ceiling the original 32-cap (15d146f3) imposed on the
  # bulk DB lane. Still well below the 2k+ goroutine event that motivated
  # the cap, since each call now releases its bulk-pool slot in milliseconds
  # rather than seconds (counter-sync no longer holds wide row locks).
  JOBS_LINK_DISCOVERY_MAX_INFLIGHT = "128"
  LOG_LEVEL = "info"
  OBSERVABILITY_ENABLED = "true"
  # Push metrics + traces to Grafana Cloud. Without this, the worker
  # registers instruments but never exports them — worker-only telemetry
  # (bee.worker.*, bee.db.*, bee.broker.*) vanishes. Auth header is
  # injected via CI secrets.
  # NOTE(review): the value already ends in /v1/traces. OpenTelemetry SDKs
  # that read OTEL_EXPORTER_OTLP_ENDPOINT themselves treat it as a base URL
  # and append the per-signal path — confirm the worker passes this URL
  # through verbatim and that metric export does not depend on it.
  OTEL_EXPORTER_OTLP_ENDPOINT = "https://otlp-gateway-prod-au-southeast-1.grafana.net/otlp/v1/traces"
  ARCHIVE_PROVIDER = "r2"
  ARCHIVE_BUCKET = "native-hover-archive"

# NOTE(review): Fly documents the "immediate" strategy as updating all
# Machines at once without waiting on health checks — confirm that is
# acceptable for tasks in flight at deploy time.
[deploy]
  strategy = "immediate"

# Fly's managed Prometheus scrapes every machine that declares this block.
# bee_broker_* + bee_worker_* metrics from the OTel exporter at :9464 land
# in https://api.fly.io/prometheus/personal so fly-autoscaler can read them
# without Grafana Cloud Basic-Auth gymnastics. Alloy's separate push to
# Grafana Cloud is unaffected.
[metrics]
  # Must stay in step with the OTel exporter's listen port (:9464).
  port = 9464
  path = "/metrics"

[processes]
  # Maps the "worker" process group to the command it runs.
  # Launch via start.sh so the Alloy metrics sidecar runs alongside the
  # worker binary. Running ./worker directly skips Alloy, which silently
  # drops every bee.worker.* and bee.broker.* metric from this process.
  worker = "./start.sh worker"

# Always restart on exit. The worker has no [http_service] block, so Fly
# has no health-check hook to wake it up, and the default on-failure
# policy gives up after 10 retries — a bad hour (Redis blip, DB failover)
# could otherwise leave the worker permanently stopped.
# Written as [[restart]] because Fly's docs show restart as an array of
# tables, even when there is only one entry.
[[restart]]
  policy = "always"

# Machine size for each worker instance: one performance-class CPU with
# 4 GB of memory.
[[vm]]
  memory = "4gb"
  cpu_kind = "performance"
  cpus = 1