Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
2749a44
first stab
maurycy Mar 21, 2026
f13d34c
s/ndjson/jsonl/
maurycy Mar 21, 2026
c15d318
printing to stdout isn't a great idea
maurycy Mar 22, 2026
6a0ea81
Merge remote-tracking branch 'upstream/main' into tachyon-ndjson-kole…
maurycy Mar 30, 2026
cb27fc0
even a basic test
maurycy Mar 30, 2026
59cbb4a
separate func for end record
maurycy Mar 30, 2026
25c6922
proper name
maurycy Mar 30, 2026
67cd39a
test_jsonl_collector_with_location_info
maurycy Mar 31, 2026
7c85d47
test synthetic frames
maurycy Mar 31, 2026
3eddae8
too many new lines
maurycy Mar 31, 2026
f71252e
BUG? confusing... two ways to set skip_idle?
maurycy Mar 31, 2026
9836ffa
Merge branch 'main' into tachyon-ndjson-kolektor
maurycy Mar 31, 2026
c183109
ok, thx b4fac15613a16f9cd7b2ee32840523b399f4621f
maurycy Mar 31, 2026
f20eb52
check if it works fine with (file, loc, func, op)
maurycy Mar 31, 2026
546ce90
missing new line
maurycy Mar 31, 2026
350ad99
filter out sync coordinator
maurycy Mar 31, 2026
942d821
s/collapsed_out/jsonl_out/, less copying :D
maurycy Mar 31, 2026
bd9aefe
nicer reading
maurycy Mar 31, 2026
311a4e3
typo
maurycy Mar 31, 2026
749a868
too much copying, left-over
maurycy Mar 31, 2026
85ce978
just Counter
maurycy Mar 31, 2026
820d3b9
ruff
maurycy Mar 31, 2026
aad4b18
future-proof name
maurycy Mar 31, 2026
da3e754
future-proof iter for streaming
maurycy Mar 31, 2026
cb6ed34
truth to be told, this should be layer above
maurycy Mar 31, 2026
5a59e0b
helper
maurycy Mar 31, 2026
192e54b
reorder
maurycy Mar 31, 2026
3189a8f
eh, just copy from heatmap
maurycy Mar 31, 2026
935779f
smaller chunk; matter of taste
maurycy Mar 31, 2026
e3d8aff
test actual chunking
maurycy Mar 31, 2026
d37f07a
test edge cases
maurycy Mar 31, 2026
aaaa972
ruff
maurycy Mar 31, 2026
a9b6ccd
match pep8
maurycy Mar 31, 2026
4fb3ade
style
maurycy Mar 31, 2026
a0decb5
too defensive
maurycy Mar 31, 2026
5f1704b
too many style changes
maurycy Mar 31, 2026
f2a21fb
less style
maurycy Mar 31, 2026
15b07ba
ha! even less style...
maurycy Mar 31, 2026
148f4e2
news
maurycy Mar 31, 2026
69c5768
news: proper formatting
maurycy Mar 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Lib/profiling/sampling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@
from .stack_collector import CollapsedStackCollector
from .heatmap_collector import HeatmapCollector
from .gecko_collector import GeckoCollector
from .jsonl_collector import JsonlCollector
from .string_table import StringTable

__all__ = ("Collector", "PstatsCollector", "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", "StringTable")
__all__ = (
"Collector",
"PstatsCollector",
"CollapsedStackCollector",
"HeatmapCollector",
"GeckoCollector",
"JsonlCollector",
"StringTable",
)
3 changes: 3 additions & 0 deletions Lib/profiling/sampling/binary_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from .gecko_collector import GeckoCollector
from .stack_collector import FlamegraphCollector, CollapsedStackCollector
from .jsonl_collector import JsonlCollector
from .pstats_collector import PstatsCollector


Expand Down Expand Up @@ -117,6 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format,
collector = PstatsCollector(interval)
elif output_format == 'gecko':
collector = GeckoCollector(interval)
elif output_format == "jsonl":
collector = JsonlCollector(interval)
else:
raise ValueError(f"Unknown output format: {output_format}")

Expand Down
25 changes: 21 additions & 4 deletions Lib/profiling/sampling/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .stack_collector import CollapsedStackCollector, FlamegraphCollector, DiffFlamegraphCollector
from .heatmap_collector import HeatmapCollector
from .gecko_collector import GeckoCollector
from .jsonl_collector import JsonlCollector
from .binary_collector import BinaryCollector
from .binary_reader import BinaryReader
from .constants import (
Expand Down Expand Up @@ -95,6 +96,7 @@ def __call__(self, parser, namespace, values, option_string=None):
"diff_flamegraph": "html",
"gecko": "json",
"heatmap": "html",
"jsonl": "jsonl",
"binary": "bin",
}

Expand All @@ -105,6 +107,7 @@ def __call__(self, parser, namespace, values, option_string=None):
"diff_flamegraph": DiffFlamegraphCollector,
"gecko": GeckoCollector,
"heatmap": HeatmapCollector,
"jsonl": JsonlCollector,
"binary": BinaryCollector,
}

Expand Down Expand Up @@ -482,6 +485,13 @@ def _add_format_options(parser, include_compression=True, include_binary=True):
action=DiffFlamegraphAction,
help="Generate differential flamegraph comparing current profile to BASELINE binary file",
)
format_group.add_argument(
"--jsonl",
action="store_const",
const="jsonl",
dest="format",
help="Generate JSONL snapshot output for external consumers",
)
if include_binary:
format_group.add_argument(
"--binary",
Expand Down Expand Up @@ -560,15 +570,17 @@ def _sort_to_mode(sort_choice):
return sort_map.get(sort_choice, SORT_MODE_NSAMPLES)

def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=False,
output_file=None, compression='auto', diff_baseline=None):
mode=None, output_file=None, compression='auto', diff_baseline=None):
Comment on lines 572 to +573
Copy link
Copy Markdown
Contributor Author

@maurycy maurycy Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is already very complex. The collector constructor signature supports all collectors at once.

I've added mode for the purpose of meta but I don't think this scales for other meta.

(Truth be told, I think that complex signatures are also the underlying reason for the issue fixed by #145459)

"""Create the appropriate collector based on format type.

Args:
format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary', 'diff_flamegraph')
format_type: The output format ('pstats', 'collapsed', 'flamegraph',
'gecko', 'heatmap', 'jsonl', 'binary', 'diff_flamegraph')
sample_interval_usec: Sampling interval in microseconds
skip_idle: Whether to skip idle samples
opcodes: Whether to collect opcode information (only used by gecko format
for creating interval markers in Firefox Profiler)
mode: Profiling mode for collectors that expose it in metadata
output_file: Output file path (required for binary format)
compression: Compression type for binary format ('auto', 'zstd', 'none')
diff_baseline: Path to baseline binary file for differential flamegraph
Expand Down Expand Up @@ -604,6 +616,11 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals
skip_idle = False
return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes)

if format_type == "jsonl":
return collector_class(
sample_interval_usec, skip_idle=skip_idle, mode=mode
)

return collector_class(sample_interval_usec, skip_idle=skip_idle)


Expand Down Expand Up @@ -978,7 +995,7 @@ def _handle_attach(args):

# Create the appropriate collector
collector = _create_collector(
args.format, args.sample_interval_usec, skip_idle, args.opcodes,
args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode,
output_file=output_file,
compression=getattr(args, 'compression', 'auto'),
diff_baseline=args.diff_baseline
Expand Down Expand Up @@ -1057,7 +1074,7 @@ def _handle_run(args):

# Create the appropriate collector
collector = _create_collector(
args.format, args.sample_interval_usec, skip_idle, args.opcodes,
args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode,
output_file=output_file,
compression=getattr(args, 'compression', 'auto'),
diff_baseline=args.diff_baseline
Expand Down
5 changes: 4 additions & 1 deletion Lib/profiling/sampling/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@ def normalize_location(location):
"""Normalize location to a 4-tuple format.
Args:
location: tuple (lineno, end_lineno, col_offset, end_col_offset) or None
location: tuple (lineno, end_lineno, col_offset, end_col_offset),
an integer line number, or None
Returns:
tuple: (lineno, end_lineno, col_offset, end_col_offset)
"""
if location is None:
return DEFAULT_LOCATION
if isinstance(location, int):
return (location, location, -1, -1)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return location


Expand Down
200 changes: 200 additions & 0 deletions Lib/profiling/sampling/jsonl_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
"""JSONL collector."""

from collections import Counter
import json
import uuid
from itertools import batched

from .constants import (
PROFILING_MODE_ALL,
PROFILING_MODE_CPU,
PROFILING_MODE_EXCEPTION,
PROFILING_MODE_GIL,
PROFILING_MODE_WALL,
)
from .collector import normalize_location
from .stack_collector import StackTraceCollector


_CHUNK_SIZE = 256
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


_MODE_NAMES = {
PROFILING_MODE_WALL: "wall",
PROFILING_MODE_CPU: "cpu",
PROFILING_MODE_GIL: "gil",
PROFILING_MODE_ALL: "all",
PROFILING_MODE_EXCEPTION: "exception",
}


class JsonlCollector(StackTraceCollector):
    """Collector that renders finalized profiling data as JSONL records.

    The output is a stream of newline-delimited JSON objects: a ``meta``
    header, interned string definitions (``str_def``), frame definitions
    (``frame_def``), per-frame aggregate sample counts (``agg``), and a
    closing ``end`` record.  Definition and aggregate records are chunked
    into batches of at most ``_CHUNK_SIZE`` entries.  Every record carries
    a schema version ``v`` and the ``run_id`` of this profiling run so
    consumers can correlate and future-proof their parsers.
    """

    def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None):
        super().__init__(sample_interval_usec, skip_idle=skip_idle)
        # Unique identifier stamped on every record of this run.
        self.run_id = uuid.uuid4().hex
        # Profiling mode (a PROFILING_MODE_* constant) or None; only
        # exported in the meta record when set.
        self._mode = mode

        # String interning: value -> 1-based id, plus export records.
        self._string_to_id = {}
        self._strings = []

        # Frame interning: identity key -> 1-based id, plus export records.
        self._frame_to_id = {}
        self._frames = []

        # Per-frame sample tallies and the overall weighted sample count.
        self._frame_self = Counter()
        self._frame_cumulative = Counter()
        self._samples_total = 0
        # Scratch set, cleared per sample, so recursive frames only
        # contribute once to the cumulative count of each sample.
        self._seen_frame_ids = set()

    def process_frames(self, frames, _thread_id, weight=1):
        """Accumulate one sampled stack (leaf frame first) with *weight*."""
        self._samples_total += weight
        self._seen_frame_ids.clear()

        for index, (filename, location, funcname, _opcode) in enumerate(frames):
            frame_id = self._get_or_create_frame_id(
                filename, location, funcname
            )

            # frames[0] is treated as the leaf: it alone earns self time.
            if index == 0:
                self._frame_self[frame_id] += weight

            # Cumulative time is counted at most once per sample per frame.
            if frame_id not in self._seen_frame_ids:
                self._seen_frame_ids.add(frame_id)
                self._frame_cumulative[frame_id] += weight

    def export(self, filename):
        """Write all collected data to *filename* as JSONL."""
        agg_header = {
            "type": "agg",
            "v": 1,
            "run_id": self.run_id,
            "kind": "frame",
            "scope": "final",
            "samples_total": self._samples_total,
        }
        with open(filename, "w", encoding="utf-8") as output:
            self._write_message(output, self._build_meta_record())
            self._write_chunked_records(
                output,
                {"type": "str_def", "v": 1, "run_id": self.run_id},
                "defs",
                self._strings,
            )
            self._write_chunked_records(
                output,
                {"type": "frame_def", "v": 1, "run_id": self.run_id},
                "defs",
                self._frames,
            )
            self._write_chunked_records(
                output, agg_header, "entries", self._iter_final_agg_entries()
            )
            self._write_message(output, self._build_end_record())

    def _build_meta_record(self):
        """Return the leading ``meta`` record describing this run."""
        record = {
            "type": "meta",
            "v": 1,
            "run_id": self.run_id,
            "sample_interval_usec": self.sample_interval_usec,
        }
        if self._mode is not None:
            # Unknown modes fall back to their raw string representation.
            record["mode"] = _MODE_NAMES.get(self._mode, str(self._mode))
        return record

    def _build_end_record(self):
        """Return the trailing ``end`` record with the final sample count."""
        return {
            "type": "end",
            "v": 1,
            "run_id": self.run_id,
            "samples_total": self._samples_total,
        }

    def _iter_final_agg_entries(self):
        """Yield one aggregate entry per known frame, in definition order."""
        for frame in self._frames:
            fid = frame["frame_id"]
            yield {
                "frame_id": fid,
                "self": self._frame_self[fid],
                "cumulative": self._frame_cumulative[fid],
            }

    def _get_or_create_frame_id(self, filename, location, funcname):
        """Intern a (filename, location, funcname) frame; return its id.

        Ids are 1-based.  A frame with no location at all (``location is
        None``) is marked synthetic in its definition record.
        """
        synthetic = location is None
        location_fields = self._location_to_export_fields(location)
        func_str_id = self._intern_string(funcname)
        path_str_id = self._intern_string(filename)

        frame_key = (
            path_str_id,
            func_str_id,
            location_fields["line"],
            location_fields.get("end_line"),
            location_fields.get("col"),
            location_fields.get("end_col"),
            synthetic,
        )
        try:
            return self._frame_to_id[frame_key]
        except KeyError:
            pass

        frame_id = len(self._frames) + 1  # ids are 1-based
        record = {
            "frame_id": frame_id,
            "path_str_id": path_str_id,
            "func_str_id": func_str_id,
            **location_fields,
        }
        if synthetic:
            record["synthetic"] = True

        self._frame_to_id[frame_key] = frame_id
        self._frames.append(record)
        return frame_id

    def _intern_string(self, value):
        """Intern *value* (coerced to ``str``) and return its 1-based id."""
        value = str(value)
        try:
            return self._string_to_id[value]
        except KeyError:
            pass

        string_id = len(self._strings) + 1  # ids are 1-based
        self._string_to_id[value] = string_id
        self._strings.append({"str_id": string_id, "value": value})
        return string_id

    @staticmethod
    def _location_to_export_fields(location):
        """Map a location to export fields, omitting absent components."""
        lineno, end_lineno, col_offset, end_col_offset = normalize_location(
            location
        )

        # Negative/zero sentinel components (see DEFAULT_LOCATION) are
        # dropped rather than exported.
        fields = {"line": lineno}
        optional = (
            ("end_line", end_lineno, end_lineno > 0),
            ("col", col_offset, col_offset >= 0),
            ("end_col", end_col_offset, end_col_offset >= 0),
        )
        for key, component, present in optional:
            if present:
                fields[key] = component
        return fields

    def _write_chunked_records(self, output, base_record, chunk_field, entries):
        """Emit *entries* as copies of *base_record*, _CHUNK_SIZE at a time."""
        for batch in batched(entries, _CHUNK_SIZE):
            self._write_message(output, {**base_record, chunk_field: batch})

    @staticmethod
    def _write_message(output, record):
        """Serialize *record* compactly and emit it as one JSONL line."""
        output.write(json.dumps(record, separators=(",", ":")) + "\n")
Loading
Loading