-
Notifications
You must be signed in to change notification settings - Fork 29
DurableEmitter: LOOP Plugin Support #2073
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
DylanTinianov
wants to merge
14
commits into
main
Choose a base branch
from
CRE-3933-loop-plugin-support
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
038fe22
LOOP support
DylanTinianov 1837553
Extract setup
DylanTinianov 64c703a
Update durable_emitter.go
DylanTinianov 71a8313
Rename retransmit
DylanTinianov f23047d
Update config.go
DylanTinianov 601238c
Rename
DylanTinianov 63cb609
Move event store
DylanTinianov 972243c
Rename store
DylanTinianov c8bb5e3
Move durable emitter to beholder start
DylanTinianov a612830
Delete durable_event_store.go
DylanTinianov eb1cd2c
Update server.go
DylanTinianov 5acdb28
Move setup to client method
DylanTinianov dabe6d6
Use chip logger
DylanTinianov 16779f4
Update server.go
DylanTinianov File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,209 @@ | ||
| // Package beholderstore provides a Postgres-backed implementation of | ||
| // beholder.DurableEventStore. It is kept in a sibling package to pkg/beholder | ||
| // so that consumers of the beholder API (including builds targeting wasip1) | ||
| // do not transitively import lib/pq. | ||
| package beholderstore | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "strings" | ||
| "time" | ||
|
|
||
| "github.com/lib/pq" | ||
|
|
||
| "github.com/smartcontractkit/chainlink-common/pkg/beholder" | ||
| "github.com/smartcontractkit/chainlink-common/pkg/sqlutil" | ||
| ) | ||
|
|
||
| const chipDurableEventsTable = "cre.chip_durable_events" | ||
|
|
||
| // Store is a Postgres-backed implementation of beholder.DurableEventStore. | ||
| type Store struct { | ||
| ds sqlutil.DataSource | ||
| } | ||
|
|
||
| var ( | ||
| _ beholder.DurableEventStore = (*Store)(nil) | ||
| _ beholder.DurableQueueObserver = (*Store)(nil) | ||
| _ beholder.BatchInserter = (*Store)(nil) | ||
| ) | ||
|
|
||
| // New returns a Postgres-backed DurableEventStore bound to ds. | ||
| func New(ds sqlutil.DataSource) *Store { | ||
| return &Store{ds: ds} | ||
| } | ||
|
|
||
| func (s *Store) Insert(ctx context.Context, payload []byte) (int64, error) { | ||
| const q = `INSERT INTO ` + chipDurableEventsTable + ` (payload) VALUES ($1) RETURNING id` | ||
| var id int64 | ||
| if err := s.ds.GetContext(ctx, &id, q, payload); err != nil { | ||
| return 0, fmt.Errorf("failed to insert chip durable event: %w", err) | ||
| } | ||
| return id, nil | ||
| } | ||
|
|
||
| func (s *Store) InsertBatch(ctx context.Context, payloads [][]byte) ([]int64, error) { | ||
| if len(payloads) == 0 { | ||
| return nil, nil | ||
| } | ||
| placeholders := make([]string, len(payloads)) | ||
| args := make([]interface{}, len(payloads)) | ||
| for i, p := range payloads { | ||
| placeholders[i] = fmt.Sprintf("($%d)", i+1) | ||
| args[i] = p | ||
| } | ||
| q := fmt.Sprintf( | ||
| "INSERT INTO %s (payload) VALUES %s RETURNING id", | ||
| chipDurableEventsTable, | ||
| strings.Join(placeholders, ","), | ||
| ) | ||
|
|
||
| var ids []int64 | ||
| if err := s.ds.SelectContext(ctx, &ids, q, args...); err != nil { | ||
| return nil, fmt.Errorf("failed to batch insert chip durable events: %w", err) | ||
| } | ||
| return ids, nil | ||
| } | ||
|
|
||
| func (s *Store) Delete(ctx context.Context, id int64) error { | ||
| const q = `DELETE FROM ` + chipDurableEventsTable + ` WHERE id = $1` | ||
| if _, err := s.ds.ExecContext(ctx, q, id); err != nil { | ||
| return fmt.Errorf("failed to delete chip durable event id=%d: %w", id, err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (s *Store) MarkDelivered(ctx context.Context, id int64) error { | ||
| const q = `UPDATE ` + chipDurableEventsTable + ` SET delivered_at = now() WHERE id = $1 AND delivered_at IS NULL` | ||
| if _, err := s.ds.ExecContext(ctx, q, id); err != nil { | ||
| return fmt.Errorf("failed to mark chip durable event delivered id=%d: %w", id, err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (s *Store) MarkDeliveredBatch(ctx context.Context, ids []int64) (int64, error) { | ||
| if len(ids) == 0 { | ||
| return 0, nil | ||
| } | ||
| const q = `UPDATE ` + chipDurableEventsTable + ` SET delivered_at = now() WHERE id = ANY($1) AND delivered_at IS NULL` | ||
| res, err := s.ds.ExecContext(ctx, q, pq.Array(ids)) | ||
| if err != nil { | ||
| return 0, fmt.Errorf("failed to batch mark chip durable events delivered: %w", err) | ||
| } | ||
| n, _ := res.RowsAffected() | ||
| return n, nil | ||
| } | ||
|
|
||
| func (s *Store) PurgeDelivered(ctx context.Context, batchLimit int) (int64, error) { | ||
| if batchLimit <= 0 { | ||
| return 0, nil | ||
| } | ||
| const q = ` | ||
| WITH picked AS ( | ||
| SELECT id FROM ` + chipDurableEventsTable + ` | ||
| WHERE delivered_at IS NOT NULL | ||
| ORDER BY delivered_at ASC | ||
| LIMIT $1 | ||
| ) | ||
| DELETE FROM ` + chipDurableEventsTable + ` AS t | ||
| USING picked WHERE t.id = picked.id` | ||
| res, err := s.ds.ExecContext(ctx, q, batchLimit) | ||
| if err != nil { | ||
| return 0, fmt.Errorf("failed to purge delivered chip durable events: %w", err) | ||
| } | ||
| n, err := res.RowsAffected() | ||
| if err != nil { | ||
| return 0, fmt.Errorf("purge delivered rows affected: %w", err) | ||
| } | ||
| return n, nil | ||
| } | ||
|
|
||
| func (s *Store) ListPending(ctx context.Context, createdBefore time.Time, limit int) ([]beholder.DurableEvent, error) { | ||
| const q = ` | ||
| SELECT id, payload, created_at | ||
| FROM ` + chipDurableEventsTable + ` | ||
| WHERE delivered_at IS NULL | ||
| AND created_at < $1 | ||
| ORDER BY created_at ASC | ||
| LIMIT $2` | ||
|
|
||
| type row struct { | ||
| ID int64 `db:"id"` | ||
| Payload []byte `db:"payload"` | ||
| CreatedAt time.Time `db:"created_at"` | ||
| } | ||
|
|
||
| var rows []row | ||
| if err := s.ds.SelectContext(ctx, &rows, q, createdBefore, limit); err != nil { | ||
| return nil, fmt.Errorf("failed to list pending chip durable events: %w", err) | ||
| } | ||
|
|
||
| out := make([]beholder.DurableEvent, 0, len(rows)) | ||
| for _, r := range rows { | ||
| out = append(out, beholder.DurableEvent{ | ||
| ID: r.ID, | ||
| Payload: r.Payload, | ||
| CreatedAt: r.CreatedAt, | ||
| }) | ||
| } | ||
| return out, nil | ||
| } | ||
|
|
||
| func (s *Store) DeleteExpired(ctx context.Context, ttl time.Duration) (int64, error) { | ||
| const q = ` | ||
| WITH deleted AS ( | ||
| DELETE FROM ` + chipDurableEventsTable + ` | ||
| WHERE created_at <= now() - $1::interval | ||
| RETURNING id | ||
| ) | ||
| SELECT count(*) FROM deleted` | ||
|
|
||
| var count int64 | ||
| if err := s.ds.GetContext(ctx, &count, q, ttl.String()); err != nil { | ||
| return 0, fmt.Errorf("failed to delete expired chip durable events: %w", err) | ||
| } | ||
| return count, nil | ||
| } | ||
|
|
||
// chipDurableQueueAgg is the scan target for the aggregate query run by
// ObserveDurableQueue over undelivered rows.
type chipDurableQueueAgg struct {
	// Cnt is the number of undelivered rows.
	Cnt int64 `db:"cnt"`
	// PayloadSum is the summed octet_length of their payloads (0 when there are none).
	PayloadSum int64 `db:"payload_sum"`
	// MinCreated is the earliest created_at among them; it is a pointer so a
	// NULL min() result can be represented as nil.
	MinCreated *time.Time `db:"min_created"`
}
|
|
||
| // ObserveDurableQueue implements beholder.DurableQueueObserver for queue depth / age gauges. | ||
| func (s *Store) ObserveDurableQueue(ctx context.Context, eventTTL, nearExpiryLead time.Duration) (beholder.DurableQueueStats, error) { | ||
| const qAgg = ` | ||
| SELECT | ||
| count(*)::bigint AS cnt, | ||
| coalesce(sum(octet_length(payload)), 0)::bigint AS payload_sum, | ||
| min(created_at) AS min_created | ||
| FROM ` + chipDurableEventsTable + ` | ||
| WHERE delivered_at IS NULL` | ||
|
|
||
| var row chipDurableQueueAgg | ||
| if err := s.ds.GetContext(ctx, &row, qAgg); err != nil { | ||
| return beholder.DurableQueueStats{}, fmt.Errorf("durable queue aggregate: %w", err) | ||
| } | ||
| var st beholder.DurableQueueStats | ||
| st.Depth = row.Cnt | ||
| st.PayloadBytes = row.PayloadSum | ||
| if row.MinCreated != nil { | ||
| st.OldestPendingAge = time.Since(*row.MinCreated) | ||
| } | ||
| if eventTTL > 0 && nearExpiryLead > 0 && nearExpiryLead < eventTTL { | ||
| ttlSec := int64(eventTTL.Round(time.Second) / time.Second) | ||
| leadSec := int64(nearExpiryLead.Round(time.Second) / time.Second) | ||
| const qNear = ` | ||
| SELECT count(*)::bigint | ||
| FROM ` + chipDurableEventsTable + ` | ||
| WHERE delivered_at IS NULL | ||
| AND created_at >= now() - ($1::bigint * interval '1 second') | ||
| AND created_at < now() - (($1::bigint - $2::bigint) * interval '1 second')` | ||
| if err := s.ds.GetContext(ctx, &st.NearTTLCount, qNear, ttlSec, leadSec); err != nil { | ||
| return beholder.DurableQueueStats{}, fmt.Errorf("durable queue near-ttl: %w", err) | ||
| } | ||
| } | ||
| return st, nil | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@DylanTinianov
Observability Comparison: DurableEmitter vs BatchEmitterService + batch.Client
Metrics Coverage
- `chip_ingress.events_sent` per domain/entity
- `beholder.durable_emitter.emit.success` (count only, no domain/entity breakdown)
- `chip_ingress.events_dropped` per domain/entity
- `beholder.durable_emitter.emit.failure` (no domain/entity)
- `chip_ingress.batch.send_requests_total` with status=success/failure
- `chip_ingress.batch.request_size_messages` histogram with max_batch_size attr
- `chip_ingress.batch.request_size_bytes` histogram with max_grpc_request_size attr
- `chip_ingress.batch.request_latency_ms` histogram with status
- `OnBatchPublish`) — not exported as a metric instrument
- `chip_ingress.batch.config.info` with all config attrs
- `queueDepth`/`queueDepthMax` gauges
- `emit.duration` + `emit.total_duration` histograms
- `expiredPurged`, `NearTTLCount` via `ObserveDurableQueue`
- `publishBatchEvOK`/`publishBatchEvErr`

Key Gaps in DurableEmitter
No per-domain/entity attribution — BatchEmitterService tags every metric with `domain` + `entity` via `metricAttrsFor()`. DurableEmitter metrics are flat counters with no cardinality, making it impossible to identify which event source is failing.

No request size observability — batch.Client records both message count and byte size histograms per send. DurableEmitter has no visibility into whether batches are approaching gRPC limits (which ties back to the missing size-splitting gap).
No send request counter with status — batch.Client's `send_requests_total` with success/failure lets you compute error rate. DurableEmitter only logs failures; no metric-based alerting is possible on publish RPC error rate.

No config info metric — batch.Client emits a gauge with all configuration parameters (batch size, buffer size, timeouts, etc.) for runtime introspection. DurableEmitter has no equivalent — you can't verify running config from metrics alone.
Latency only via hooks, not instruments — the `OnBatchPublish` callback provides latency to test code but doesn't register an OTel histogram. You can't dashboard or alert on publish latency without custom wiring.

Where DurableEmitter is Better
Logging
- `Errorw("failed to publish batch")`
- `Warnw("PublishBatch failed, events will be retransmitted")`
- `Warnw("batch publish channel full, relying on retransmit")`
- `Warnw("timed out waiting for shutdown")`
- `Infow` log lines (coalescing, raw-codec)

Summary
BatchEmitterService has better transport-layer observability (request sizes, latency histograms, per-domain attribution, config gauges). DurableEmitter has better persistence-layer observability (queue depth, DB operation metrics, emit latency). If DurableEmitter composed over BatchEmitterService, you'd get both layers covered without duplication.