-
-
Notifications
You must be signed in to change notification settings - Fork 0
feat(streams): Consumer watermark commit latency metric #314
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c6c6862
4a9756b
a83b271
e8a32dc
2d91d19
90b6fe0
19c6aea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,14 +7,22 @@ use sentry_arroyo::processing::strategies::{ | |
| use sentry_arroyo::types::{Message, Partition}; | ||
|
|
||
| use crate::messages::{RoutedValuePayload, WatermarkMessage}; | ||
| #[cfg(test)] | ||
| use crate::mocks::current_epoch; | ||
| use crate::routes::RoutedValue; | ||
| #[cfg(not(test))] | ||
| use crate::time_helpers::current_epoch; | ||
|
|
||
| /// Histogram: seconds elapsed from the watermark's `last_message_time` to the commit decision, clamped at 0 (records 0 when `last_message_time` is absent). | ||
| const METRIC_WATERMARK_COMMIT_LATENCY: &str = "streams.pipeline.consumer.watermark_commit_latency"; | ||
|
|
||
| /// Records the committable of a received Watermark and records how many times that watermark has been seen. | ||
| #[derive(Clone, Debug)] | ||
| struct WatermarkTracker { | ||
| num_watermarks: u64, | ||
| committable: HashMap<Partition, u64>, | ||
| time_added: Instant, | ||
| last_message_time: Option<f64>, | ||
| } | ||
|
|
||
| /// WatermarkCommitOffsets is a commit policy that only commits once it receives a copy of a Watermark | ||
|
|
@@ -52,13 +60,25 @@ impl WatermarkCommitOffsets { | |
| }; | ||
| let mut to_remove = vec![]; | ||
| let mut commit_request = empty_commit_request.clone(); | ||
| // Track the oldest (minimum) last_message_time across all watermarks that contribute | ||
| // to the merged commit, so we record latency once per commit() based on the | ||
| // watermark furthest behind. | ||
| let mut oldest_last_message_time: Option<f64> = None; | ||
| for (ts, watermark) in self.watermarks.iter() { | ||
| if watermark.num_watermarks == self.num_branches { | ||
| let current_request = CommitRequest { | ||
| positions: watermark.committable.clone(), | ||
| }; | ||
| commit_request = | ||
| merge_commit_request(Some(commit_request), Some(current_request)).unwrap(); | ||
|
|
||
| if let Some(t) = watermark.last_message_time { | ||
| oldest_last_message_time = Some(match oldest_last_message_time { | ||
| Some(prev) => prev.min(t), | ||
| None => t, | ||
| }); | ||
| } | ||
|
|
||
|
fpacifici marked this conversation as resolved.
|
||
| to_remove.push(ts.clone()); | ||
| // Clean up any hanging watermarks which still haven't gotten all their copies in 5 min | ||
| // from when the first copy was seen | ||
|
|
@@ -71,6 +91,10 @@ impl WatermarkCommitOffsets { | |
| } | ||
|
|
||
| if commit_request != empty_commit_request { | ||
| let secs = oldest_last_message_time | ||
| .map(|t| ((current_epoch() as f64) - t).max(0.0)) | ||
| .unwrap_or(0.0); | ||
| metrics::histogram!(METRIC_WATERMARK_COMMIT_LATENCY).record(secs); | ||
|
cursor[bot] marked this conversation as resolved.
fpacifici marked this conversation as resolved.
|
||
| Some(commit_request) | ||
| } else { | ||
| None | ||
|
|
@@ -94,7 +118,8 @@ impl ProcessingStrategy<RoutedValue> for WatermarkCommitOffsets { | |
| WatermarkTracker { | ||
| num_watermarks: tracker.num_watermarks + 1, | ||
| committable: tracker.committable.clone(), | ||
| time_added: tracker.time_added.clone(), | ||
| time_added: tracker.time_added, | ||
| last_message_time: tracker.last_message_time, | ||
| }, | ||
| ); | ||
| } | ||
|
Comment on lines
118
to
125
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: When two watermarks have the same timestamp, the […] Suggested Fix: When a watermark arrives with a timestamp that is already being tracked, update the tracker's […] Prompt for AI Agent |
||
|
|
@@ -109,6 +134,7 @@ impl ProcessingStrategy<RoutedValue> for WatermarkCommitOffsets { | |
| num_watermarks: 1, | ||
| committable: committable, | ||
| time_added: Instant::now(), | ||
| last_message_time: watermark.last_message_time, | ||
| }, | ||
| ); | ||
| } | ||
|
|
@@ -129,15 +155,60 @@ impl ProcessingStrategy<RoutedValue> for WatermarkCommitOffsets { | |
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use crate::{messages::Watermark, routes::Route, testutils::make_committable}; | ||
| use crate::messages::Watermark; | ||
| use crate::mocks::set_timestamp; | ||
| use crate::{routes::Route, testutils::make_committable}; | ||
|
|
||
| use metrics::{Key, KeyName, Metadata, Recorder, SharedString, Unit}; | ||
| use std::sync::{Arc, Mutex}; | ||
|
|
||
| use super::*; | ||
|
|
||
| /// Minimal histogram-only recorder modeled on `pipeline_stats::tests::CaptureRecorder`. | ||
| #[derive(Default)] | ||
| struct CaptureRecorder { | ||
| histograms: Arc<Mutex<Vec<(Key, f64)>>>, | ||
| } | ||
|
|
||
| impl Recorder for CaptureRecorder { | ||
| fn describe_counter(&self, _: KeyName, _: Option<Unit>, _: SharedString) {} | ||
| fn describe_gauge(&self, _: KeyName, _: Option<Unit>, _: SharedString) {} | ||
| fn describe_histogram(&self, _: KeyName, _: Option<Unit>, _: SharedString) {} | ||
| fn register_counter(&self, _: &Key, _: &Metadata<'_>) -> metrics::Counter { | ||
| metrics::Counter::noop() | ||
| } | ||
| fn register_gauge(&self, _: &Key, _: &Metadata<'_>) -> metrics::Gauge { | ||
| metrics::Gauge::noop() | ||
| } | ||
| fn register_histogram(&self, key: &Key, _: &Metadata<'_>) -> metrics::Histogram { | ||
| metrics::Histogram::from_arc(Arc::new(CaptureHistogram { | ||
| key: key.clone(), | ||
| histograms: Arc::clone(&self.histograms), | ||
| })) | ||
| } | ||
| } | ||
|
|
||
| struct CaptureHistogram { | ||
| key: Key, | ||
| histograms: Arc<Mutex<Vec<(Key, f64)>>>, | ||
| } | ||
|
|
||
| impl metrics::HistogramFn for CaptureHistogram { | ||
| fn record(&self, value: f64) { | ||
| self.histograms | ||
| .lock() | ||
| .unwrap() | ||
| .push((self.key.clone(), value)); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_commit_offsets() { | ||
| // Pin current_epoch so the latency metric is deterministic. | ||
| set_timestamp(100); | ||
| let mut commit_step = WatermarkCommitOffsets::new(2); | ||
|
|
||
| let watermark = Watermark::new(make_committable(3, 0), 0); | ||
| let watermark = Watermark::with_last_message_time(make_committable(3, 0), 0, Some(80.0)); | ||
| let mut messages = vec![]; | ||
| for waypoint in ["route1", "route2"] { | ||
| messages.push(Message::new_any_message( | ||
|
|
@@ -165,11 +236,37 @@ mod tests { | |
| ); | ||
| } | ||
|
|
||
| // Second watermark actually returns CommitRequest on poll() | ||
| // Second watermark actually returns CommitRequest on poll() and records the latency | ||
| // metric once based on the (only) tracker's last_message_time. | ||
| let _ = commit_step.submit(messages[1].clone()); | ||
| assert_eq!(commit_step.watermarks[&ts].num_watermarks, 2); | ||
| if let Ok(None) = commit_step.poll() { | ||
| panic!("Commit step returned didn't return CommitRequest with 2 watermarks"); | ||
| } | ||
|
|
||
| let histograms = Arc::new(Mutex::new(Vec::<(Key, f64)>::new())); | ||
| let recorder = CaptureRecorder { | ||
| histograms: Arc::clone(&histograms), | ||
| }; | ||
| let result = { | ||
| let _guard = metrics::set_default_local_recorder(&recorder); | ||
| commit_step.poll() | ||
| }; | ||
| assert!( | ||
| matches!(result, Ok(Some(_))), | ||
| "Commit step returned didn't return CommitRequest with 2 watermarks" | ||
| ); | ||
|
|
||
| let recorded: Vec<f64> = histograms | ||
| .lock() | ||
| .unwrap() | ||
| .iter() | ||
| .filter(|(k, _)| k.name() == METRIC_WATERMARK_COMMIT_LATENCY) | ||
| .map(|(_, v)| *v) | ||
| .collect(); | ||
| assert_eq!( | ||
| recorded, | ||
| vec![20.0], | ||
| "expected exactly one latency sample of (current_epoch - last_message_time)" | ||
| ); | ||
|
|
||
| set_timestamp(0); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.