-
-
Notifications
You must be signed in to change notification settings - Fork 759
fix: use task-stream index instead of wall clock in get_task_stream context manager (#9253) #9282
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -5000,6 +5000,7 @@ def get_task_stream( | |||||
| plot=False, | ||||||
| filename="task-stream.html", | ||||||
| bokeh_resources=None, | ||||||
| start_index=None, | ||||||
| ): | ||||||
| """Get task stream data from scheduler | ||||||
|
|
||||||
|
|
@@ -5070,6 +5071,7 @@ def get_task_stream( | |||||
| plot=plot, | ||||||
| filename=filename, | ||||||
| bokeh_resources=bokeh_resources, | ||||||
| start_index=start_index, | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| ) | ||||||
|
|
||||||
| async def _get_task_stream( | ||||||
|
|
@@ -5080,8 +5082,11 @@ async def _get_task_stream( | |||||
| plot=False, | ||||||
| filename="task-stream.html", | ||||||
| bokeh_resources=None, | ||||||
| start_index=None, | ||||||
| ): | ||||||
| msgs = await self.scheduler.get_task_stream(start=start, stop=stop, count=count) | ||||||
| msgs = await self.scheduler.get_task_stream( | ||||||
| start=start, stop=stop, count=count, start_index=start_index | ||||||
| ) | ||||||
| if plot: | ||||||
| from distributed.diagnostics.task_stream import rectangles | ||||||
|
|
||||||
|
|
@@ -6080,39 +6085,33 @@ def __init__(self, client=None, plot=False, filename="task-stream.html"): | |||||
| self._filename = filename | ||||||
| self.figure = None | ||||||
| self.client = client or default_client() | ||||||
| self._init = False | ||||||
| self._start_index = None | ||||||
|
|
||||||
| def __enter__(self): | ||||||
| if not self._init: | ||||||
| self.client.get_task_stream(start=0, stop=0) # ensure plugin | ||||||
| self._init = True | ||||||
|
|
||||||
| # Smooth over time differences of client vs. workers | ||||||
| # FIXME this is very crude. We should query TaskStreamPlugin.index instead. | ||||||
| self.start = time() - 0.1 | ||||||
| # Record the scheduler's task-stream cursor on entry and collect | ||||||
| # everything appended after it on exit. Using the monotonic index | ||||||
| # instead of a wall-clock boundary avoids dropping tasks when there is | ||||||
| # latency or clock skew between the client and the workers. | ||||||
| self._start_index = self.client.sync( | ||||||
| self.client.scheduler.get_task_stream_index | ||||||
| ) | ||||||
| return self | ||||||
|
|
||||||
| def __exit__(self, exc_type, exc_value, traceback): | ||||||
| L = self.client.get_task_stream( | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| start=self.start, plot=self._plot, filename=self._filename | ||||||
| start_index=self._start_index, plot=self._plot, filename=self._filename | ||||||
| ) | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| if self._plot: | ||||||
| L, self.figure = L | ||||||
| self.data.extend(L) | ||||||
|
|
||||||
| async def __aenter__(self): | ||||||
| if not self._init: | ||||||
| await self.client.get_task_stream(start=0, stop=0) # ensure plugin | ||||||
| self._init = True | ||||||
|
|
||||||
| # Smooth over time differences of client vs. workers | ||||||
| # FIXME this is very crude. We should query TaskStreamPlugin.index instead. | ||||||
| self.start = time() - 0.1 | ||||||
| self._start_index = await self.client.scheduler.get_task_stream_index() | ||||||
| return self | ||||||
|
|
||||||
| async def __aexit__(self, exc_type, exc_value, traceback): | ||||||
| L = await self.client.get_task_stream( | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| start=self.start, plot=self._plot, filename=self._filename | ||||||
| start_index=self._start_index, plot=self._plot, filename=self._filename | ||||||
| ) | ||||||
| if self._plot: | ||||||
| L, self.figure = L | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,7 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from collections import deque | ||
|
|
||
| import pytest | ||
| from tlz import frequencies | ||
|
|
||
|
|
@@ -82,6 +84,45 @@ async def test_collect(c, s, a, b): | |
| assert tasks.collect(start=start, count=3) == list(tasks.buffer)[:3] | ||
|
|
||
|
|
||
| @gen_cluster(client=True) | ||
| async def test_collect_start_index(c, s, a, b): | ||
| tasks = TaskStreamPlugin(s) | ||
| s.add_plugin(tasks) | ||
|
|
||
| futures = c.map(slowinc, range(5), delay=0.05) | ||
| await wait(futures) | ||
| midpoint = tasks.index | ||
|
|
||
| futures = c.map(slowinc, range(5, 10), delay=0.05) | ||
| await wait(futures) | ||
|
|
||
| # ``start_index`` selects by append position, not wall-clock time, so it | ||
| # returns exactly the records appended at or after the given index. | ||
| assert len(tasks.collect(start_index=0)) == 10 | ||
| assert len(tasks.collect(start_index=midpoint)) == 5 | ||
| assert len(tasks.collect(start_index=tasks.index)) == 0 | ||
|
|
||
|
|
||
| def test_collect_start_index_ignores_clock(): | ||
| # When the worker clock lags the client clock (or there is latency), a task | ||
| # can finish with a recorded stop time that is earlier than the client's | ||
| # ``start`` boundary. The time-based collection then drops the task, which | ||
| # is the latency/clock-skew failure from the original bug report. The | ||
| # index-based path must still return it. | ||
| plugin = TaskStreamPlugin.__new__(TaskStreamPlugin) | ||
| plugin.buffer = deque() | ||
| plugin.index = 0 | ||
|
|
||
| now = time() | ||
| plugin.buffer.append({"key": "task", "startstops": [{"stop": now - 100}]}) | ||
| plugin.index += 1 | ||
|
|
||
| # Time-based collection misses the task because its stop time is in the past. | ||
| assert plugin.collect(start=now) == [] | ||
| # Index-based collection captures it regardless of the clock. | ||
| assert len(plugin.collect(start_index=0)) == 1 | ||
|
|
||
|
|
||
|
Comment on lines
+106
to
+125
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test is very hacky and IMHO over-engineered; I'd rather stick to the public API whenever possible. |
||
| @gen_cluster(client=True) | ||
| async def test_client(c, s, a, b): | ||
| await c.get_task_stream() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this should not be exposed in the public API