diff --git a/CHANGELOG.md b/CHANGELOG.md index 686d33c59..10513cab5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ **Features**: +- Native: add opt-in async crash upload mode so crashed apps can exit early after crash data is captured, while the crash daemon finishes potentially large uploads in the background. ([#1739](https://github.com/getsentry/sentry-native/pull/1739)) - Add a `transfer_timeout` option for SDK-managed HTTP transports. ([#1741](https://github.com/getsentry/sentry-native/pull/1741)) **Fixes**: diff --git a/examples/example.c b/examples/example.c index 372dfc4b1..16c2edd22 100644 --- a/examples/example.c +++ b/examples/example.c @@ -790,6 +790,10 @@ main(int argc, char **argv) } } } + if (has_arg(argc, argv, "async-crash-upload")) { + sentry_options_set_crash_upload_mode( + options, SENTRY_CRASH_UPLOAD_MODE_ASYNC); + } // E2E test mode: generate unique test ID for event correlation char e2e_test_id[37] = { 0 }; diff --git a/include/sentry.h b/include/sentry.h index 7f7dcecdf..18c67ebe5 100644 --- a/include/sentry.h +++ b/include/sentry.h @@ -1073,6 +1073,25 @@ typedef enum { SENTRY_CRASH_REPORTING_MODE_NATIVE_WITH_MINIDUMP = 2, } sentry_crash_reporting_mode_t; +/** + * Crash upload mode for the native backend. + * Controls whether the crashed application remains blocked while upload and + * shutdown work finishes after crash data has been captured. + */ +typedef enum { + /** + * Keep the crashed application blocked until the native crash daemon + * finishes upload and shutdown work. + */ + SENTRY_CRASH_UPLOAD_MODE_SYNC = 0, + + /** + * Allow the crashed application to terminate after crash data has been + * captured. The native crash daemon continues upload and shutdown work. + */ + SENTRY_CRASH_UPLOAD_MODE_ASYNC = 1, +} sentry_crash_upload_mode_t; + /** * Controls if and when envelopes are kept in the persistent cache. */ @@ -1882,6 +1901,27 @@ SENTRY_API void sentry_options_set_crash_reporting_mode( SENTRY_API sentry_crash_reporting_mode_t sentry_options_get_crash_reporting_mode(const sentry_options_t *opts); +/** + * Sets the crash upload mode for the native backend. + * + * This setting controls what happens after crash data has been captured. In + * sync mode, the crashed application remains blocked while the native crash + * daemon finishes upload and shutdown work. In async mode, the crashed + * application can terminate after crash data has been captured while the daemon + * continues upload and shutdown work. + * + * This setting only has an effect when using the `native` backend. + * Default is `SENTRY_CRASH_UPLOAD_MODE_SYNC`. + */ +SENTRY_API void sentry_options_set_crash_upload_mode( + sentry_options_t *opts, sentry_crash_upload_mode_t mode); + +/** + * Gets the crash upload mode for the native backend. + */ +SENTRY_API sentry_crash_upload_mode_t sentry_options_get_crash_upload_mode( + const sentry_options_t *opts); + /** * Enables a wait for the crash report upload to be finished before shutting * down. This is disabled by default. diff --git a/src/backends/native/sentry_crash_context.h b/src/backends/native/sentry_crash_context.h index a2d6bba04..c02b1a1f1 100644 --- a/src/backends/native/sentry_crash_context.h +++ b/src/backends/native/sentry_crash_context.h @@ -122,7 +122,8 @@ typedef enum { SENTRY_CRASH_STATE_READY = 0, SENTRY_CRASH_STATE_CRASHED = 1, SENTRY_CRASH_STATE_PROCESSING = 2, - SENTRY_CRASH_STATE_DONE = 3 + SENTRY_CRASH_STATE_CAPTURED = 3, + SENTRY_CRASH_STATE_DONE = 4 } sentry_crash_state_t; /** @@ -274,6 +275,7 @@ typedef struct { // Configuration (set by app during init) sentry_minidump_mode_t minidump_mode; int crash_reporting_mode; // sentry_crash_reporting_mode_t + int crash_upload_mode; // sentry_crash_upload_mode_t bool debug_enabled; // Debug logging enabled in parent process bool attach_screenshot; // Screenshot attachment enabled in parent process bool attach_session_replay; // Session replay attachment enabled in parent diff --git a/src/backends/native/sentry_crash_daemon.c b/src/backends/native/sentry_crash_daemon.c index d45fcb25b..f918d565e 100644 --- a/src/backends/native/sentry_crash_daemon.c +++ b/src/backends/native/sentry_crash_daemon.c @@ -2805,10 +2805,11 @@ write_envelope_with_minidump(const sentry_options_t *options, * * Called by the crash daemon (out-of-process on Linux/macOS). */ -void +bool sentry__process_crash(const sentry_options_t *options, sentry_crash_ipc_t *ipc) { SENTRY_DEBUG("Processing crash - START"); + bool crash_captured = false; sentry_crash_context_t *ctx = ipc->shmem; @@ -3119,11 +3120,14 @@ sentry__process_crash(const sentry_options_t *options, sentry_crash_ipc_t *ipc) if (options && options->transport && options->run) { SENTRY_DEBUG("Capturing crash envelope"); sentry__capture_envelope(options->transport, envelope, options); + crash_captured = true; SENTRY_DEBUG("Crash envelope captured (queued)"); } else { SENTRY_WARN("No transport available for sending envelope"); sentry_envelope_free(envelope); } + } else { + crash_captured = true; } // Clean up temporary envelope file (keep minidump for @@ -3194,6 +3198,7 @@ sentry__process_crash(const sentry_options_t *options, sentry_crash_ipc_t *ipc) done: SENTRY_DEBUG("Processing crash - END"); SENTRY_DEBUG("Crash processing complete"); + return crash_captured; } /** @@ -3481,9 +3486,21 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, long state = sentry__atomic_fetch(&ipc->shmem->state); if (state == SENTRY_CRASH_STATE_CRASHED && !crash_processed) { SENTRY_DEBUG("Crash notification received, processing"); - sentry__process_crash(options, ipc); + bool crash_captured = sentry__process_crash(options, ipc); crash_processed = true; + if (crash_captured + && ipc->shmem->crash_upload_mode + == SENTRY_CRASH_UPLOAD_MODE_ASYNC) { + // Crash data is durable after processing returns; + // remaining daemon work does not require the crashed + // process. + SENTRY_DEBUG( + "Crash captured, allowing app process to exit"); + sentry__atomic_store( + &ipc->shmem->state, SENTRY_CRASH_STATE_CAPTURED); + } + // After processing crash, exit regardless of parent state // (parent has likely already exited after re-raising signal) SENTRY_DEBUG("Crash processed, daemon exiting"); diff --git a/src/backends/native/sentry_crash_daemon.h b/src/backends/native/sentry_crash_daemon.h index 69c93e9a2..2ae3a3576 100644 --- a/src/backends/native/sentry_crash_daemon.h +++ b/src/backends/native/sentry_crash_daemon.h @@ -65,8 +65,9 @@ int sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, * * @param options Sentry options (DSN, transport, etc.) * @param ipc Crash IPC with crash context in shared memory + * @return true if the crash envelope was captured for upload or reporting */ -void sentry__process_crash( +bool sentry__process_crash( const struct sentry_options_s *options, sentry_crash_ipc_t *ipc); #endif diff --git a/src/backends/native/sentry_crash_handler.c b/src/backends/native/sentry_crash_handler.c index fe588d938..415f7575a 100644 --- a/src/backends/native/sentry_crash_handler.c +++ b/src/backends/native/sentry_crash_handler.c @@ -670,8 +670,8 @@ crash_signal_handler(int signum, siginfo_t *info, void *context) if (state == SENTRY_CRASH_STATE_PROCESSING && !processing_started) { // Daemon started processing (no logging - signal-safe) processing_started = true; - } else if (state == SENTRY_CRASH_STATE_DONE) { - // Daemon finished processing (no logging - signal-safe) + } else if (state >= SENTRY_CRASH_STATE_CAPTURED) { + // Daemon captured crash data (no logging - signal-safe) goto daemon_handling; } @@ -954,8 +954,8 @@ crash_exception_filter(EXCEPTION_POINTERS *exception_info) // Daemon started processing (no logging - exception filter // context) processing_started = true; - } else if (state == SENTRY_CRASH_STATE_DONE) { - // Daemon finished processing (no logging - exception filter + } else if (state >= SENTRY_CRASH_STATE_CAPTURED) { + // Daemon captured crash data (no logging - exception filter // context) break; } diff --git a/src/backends/native/sentry_wer.c b/src/backends/native/sentry_wer.c index 85a0b23d1..37542528c 100644 --- a/src/backends/native/sentry_wer.c +++ b/src/backends/native/sentry_wer.c @@ -156,7 +156,7 @@ process_wer_exception( waited_ms += SENTRY_CRASH_HANDLER_POLL_INTERVAL_MS) { if (InterlockedCompareExchange(&ctx->state, SENTRY_CRASH_STATE_DONE, SENTRY_CRASH_STATE_DONE) - == SENTRY_CRASH_STATE_DONE) { + >= SENTRY_CRASH_STATE_CAPTURED) { break; } Sleep(SENTRY_CRASH_HANDLER_POLL_INTERVAL_MS); diff --git a/src/backends/sentry_backend_native.c b/src/backends/sentry_backend_native.c index 5477dd2db..495bd0778 100644 --- a/src/backends/sentry_backend_native.c +++ b/src/backends/sentry_backend_native.c @@ -294,6 +294,7 @@ native_backend_startup( // Set crash reporting mode from options ctx->crash_reporting_mode = options->crash_reporting_mode; + ctx->crash_upload_mode = options->crash_upload_mode; // Pass debug logging setting to daemon ctx->debug_enabled = options->debug; diff --git a/src/sentry_options.c b/src/sentry_options.c index 830d23575..cb5bb936a 100644 --- a/src/sentry_options.c +++ b/src/sentry_options.c @@ -98,6 +98,7 @@ sentry_options_new(void) opts->crash_reporting_mode = SENTRY_CRASH_REPORTING_MODE_NATIVE_WITH_MINIDUMP; // Default: best of // both worlds + opts->crash_upload_mode = SENTRY_CRASH_UPLOAD_MODE_SYNC; opts->http_retry = false; opts->send_client_reports = true; opts->enable_large_attachments = false; @@ -616,6 +617,25 @@ sentry_options_get_crash_reporting_mode(const sentry_options_t *opts) return (sentry_crash_reporting_mode_t)opts->crash_reporting_mode; } +void +sentry_options_set_crash_upload_mode( + sentry_options_t *opts, sentry_crash_upload_mode_t mode) +{ + int imode = (int)mode; + if (imode < SENTRY_CRASH_UPLOAD_MODE_SYNC) { + imode = SENTRY_CRASH_UPLOAD_MODE_SYNC; + } else if (imode > SENTRY_CRASH_UPLOAD_MODE_ASYNC) { + imode = SENTRY_CRASH_UPLOAD_MODE_ASYNC; + } + opts->crash_upload_mode = imode; +} + +sentry_crash_upload_mode_t +sentry_options_get_crash_upload_mode(const sentry_options_t *opts) +{ + return (sentry_crash_upload_mode_t)opts->crash_upload_mode; +} + void sentry_options_set_crashpad_wait_for_upload( sentry_options_t *opts, int wait_for_upload) diff --git a/src/sentry_options.h b/src/sentry_options.h index bdd116830..6f64bba43 100644 --- a/src/sentry_options.h +++ b/src/sentry_options.h @@ -102,6 +102,7 @@ struct sentry_options_s { // sentry_crash_context.h) int crash_reporting_mode; // 0=minidump, 1=native, 2=native_with_minidump // (see sentry_crash_reporting_mode_t) + int crash_upload_mode; // 0=sync, 1=async (see sentry_crash_upload_mode_t) #ifdef SENTRY_PLATFORM_NX void (*network_connect_func)(void); diff --git a/tests/__init__.py b/tests/__init__.py index ecfd7f300..f709ecabe 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -4,6 +4,7 @@ import io import json import sys +import time import urllib import pytest import pprint @@ -24,6 +25,8 @@ def adb(*args, **kwargs): SENTRY_VERSION = "0.14.2" +from .assertions import wait_for_daemon as _wait_for_daemon + def make_dsn(httpserver, auth="uiaeosnrtdy", id=123456, proxy_host=False): url = urllib.parse.urlsplit(httpserver.url_for("/{}".format(id))) @@ -96,7 +99,13 @@ def extract_request(httpserver_log, cond): return (None, httpserver_log) -def run(cwd, exe, args, expect_failure=False, env=None, **kwargs): +def run( + cwd, exe, args, expect_failure=False, env=None, wait_for_daemon=False, **kwargs +): + if wait_for_daemon: + assert ( + "log" in args or exe != "sentry_example" + ), "sentry_example needs 'log' when waiting for the daemon" if env is None: env = dict(os.environ) if kwargs.get("check"): @@ -105,6 +114,7 @@ def run(cwd, exe, args, expect_failure=False, env=None, **kwargs): ) check = expect_failure == False __tracebackhide__ = True + started_at = time.time() if os.environ.get("ANDROID_API"): # older android emulators do not correctly pass down the returncode # so we basically echo the return code, and parse it manually @@ -179,6 +189,10 @@ def run(cwd, exe, args, expect_failure=False, env=None, **kwargs): ] try: result = subprocess.run([*cmd, *args], cwd=cwd, env=env, check=check, **kwargs) + if wait_for_daemon: + assert _wait_for_daemon( + cwd, started_at + ), "native crash daemon did not finish within timeout" if expect_failure: assert ( result.returncode != 0 diff --git a/tests/assertions.py b/tests/assertions.py index 7ec07e876..0f505d25f 100644 --- a/tests/assertions.py +++ b/tests/assertions.py @@ -13,7 +13,7 @@ import msgpack from . import SENTRY_VERSION -from .conditions import is_android +from .conditions import is_android, is_asan, is_tsan VERSION_RE = re.compile(r"(\d+\.\d+\.\d+)[-.]?(.*)") @@ -676,3 +676,31 @@ def wait_for_file(path, timeout=10.0, poll_interval=0.1): return True time.sleep(poll_interval) return False + + +def wait_for_daemon(tmp_path, started_at, timeout=None): + import time + + if timeout is None: + timeout = 30.0 if is_asan or is_tsan else 10.0 + + db_dir = Path(tmp_path) / ".sentry-native" + # Account for filesystems that truncate mtimes below time.time() precision. + started_at -= 1.0 + + deadline = time.time() + timeout + while time.time() < deadline: + for log_path in db_dir.glob("sentry-daemon-*.log"): + try: + if log_path.stat().st_mtime < started_at: + continue + log = log_path.read_text(errors="replace") + except OSError: + continue + + if "Marking crash state as DONE" in log: + return True + + time.sleep(0.1) + + return False diff --git a/tests/test_e2e_sentry.py b/tests/test_e2e_sentry.py index 782842ec1..4a3877b4a 100644 --- a/tests/test_e2e_sentry.py +++ b/tests/test_e2e_sentry.py @@ -352,7 +352,7 @@ def extract_test_id(output): raise ValueError(f"TEST_ID not found in output. Output was:\n{decoded[:500]}") -def run_crash_e2e(tmp_path, exe, args, env): +def run_crash_e2e(tmp_path, exe, args, env, wait_for_daemon=False): """ Run a crash test for E2E, capturing output for test ID extraction. @@ -370,7 +370,14 @@ def run_crash_e2e(tmp_path, exe, args, env): env["ASAN_OPTIONS"] = asan_signal_opts # Use check_output to capture stdout for test ID extraction - return check_output(tmp_path, exe, args, env=env, expect_failure=True) + return check_output( + tmp_path, + exe, + args, + env=env, + expect_failure=True, + wait_for_daemon=wait_for_daemon, + ) @pytest.mark.skipif( @@ -427,12 +434,15 @@ def run_crash_and_send(self, mode_args): # Run with crash - capture output for test ID # Enable structured logs and capture a log message before crashing crash_args = ["log", "e2e-test", "capture-log"] + mode_args + ["crash"] - output = run_crash_e2e(self.tmp_path, "sentry_example", crash_args, env=env) + output = run_crash_e2e( + self.tmp_path, + "sentry_example", + crash_args, + env=env, + wait_for_daemon=True, + ) test_id = extract_test_id(output) - # Wait for crash daemon to process - time.sleep(2) - # Print daemon logs for debugging (especially useful for Windows thread duplication investigation) self.print_daemon_logs() diff --git a/tests/test_integration_native.py b/tests/test_integration_native.py index d121101a4..138564cd1 100644 --- a/tests/test_integration_native.py +++ b/tests/test_integration_native.py @@ -39,7 +39,7 @@ SANITIZER_ARGS = ["shutdown-timeout", "10000"] if is_asan or is_tsan else [] -def run_crash(tmp_path, exe, args, env): +def run_crash(tmp_path, exe, args, env, wait_for_daemon=False): """ Run a crash test. @@ -62,7 +62,14 @@ def run_crash(tmp_path, exe, args, env): else: env["ASAN_OPTIONS"] = asan_signal_opts - run(tmp_path, exe, args, expect_failure=True, env=env) + run( + tmp_path, + exe, + args, + expect_failure=True, + env=env, + wait_for_daemon=wait_for_daemon, + ) def test_native_capture_crash(cmake, httpserver): @@ -1005,6 +1012,7 @@ def test_native_cache_keep(cmake, cache_keep, unreachable_dsn): "sentry_example", ["log", "stdout", "crash"] + (["cache-keep"] if cache_keep else []), env=env, + wait_for_daemon=not cache_keep, ) if cache_keep: @@ -1015,8 +1023,4 @@ def test_native_cache_keep(cmake, cache_keep, unreachable_dsn): assert len(dmp_files) == 1 assert cache_files[0].stem == dmp_files[0].stem else: - # Best-effort wait for crash processing to finish. 2s is not - # guaranteed to be enough, but we cannot poll for the non-existence - # of a file. - time.sleep(2) assert len(list(cache_dir.glob("*.envelope"))) == 0 diff --git a/tests/test_integration_tus.py b/tests/test_integration_tus.py index 601ede97e..81318f13b 100644 --- a/tests/test_integration_tus.py +++ b/tests/test_integration_tus.py @@ -525,9 +525,15 @@ def test_tus_crash_native(cmake, httpserver): run( tmp_path, "sentry_example", - ["log", "large-attachment", "crash"], + [ + "log", + "large-attachment", + "async-crash-upload", + "crash", + ], expect_failure=True, env=env, + wait_for_daemon=True, ) assert waiting.result diff --git a/tests/unit/test_native_backend.c b/tests/unit/test_native_backend.c index 661175574..6b5b196ce 100644 --- a/tests/unit/test_native_backend.c +++ b/tests/unit/test_native_backend.c @@ -367,6 +367,9 @@ SENTRY_TEST(crash_context_transport_fields) ctx->shutdown_timeout = 12345; TEST_CHECK_UINT64_EQUAL(ctx->shutdown_timeout, 12345); + ctx->crash_upload_mode = SENTRY_CRASH_UPLOAD_MODE_ASYNC; + TEST_CHECK_INT_EQUAL( + ctx->crash_upload_mode, SENTRY_CRASH_UPLOAD_MODE_ASYNC); ctx->transfer_timeout = 45000; TEST_CHECK_UINT64_EQUAL(ctx->transfer_timeout, 45000); @@ -376,6 +379,7 @@ SENTRY_TEST(crash_context_transport_fields) TEST_CHECK(ctx->proxy[0] == '\0'); TEST_CHECK(ctx->user_agent[0] == '\0'); TEST_CHECK_UINT64_EQUAL(ctx->shutdown_timeout, 0); + TEST_CHECK_INT_EQUAL(ctx->crash_upload_mode, SENTRY_CRASH_UPLOAD_MODE_SYNC); TEST_CHECK_UINT64_EQUAL(ctx->transfer_timeout, 0); sentry_free(ctx); @@ -398,6 +402,8 @@ SENTRY_TEST(crash_context_options_propagation) sentry_options_set_ca_certs(options, "/path/to/ca-bundle.crt"); sentry_options_set_proxy(options, "http://myproxy:3128"); sentry_options_set_shutdown_timeout(options, 12345); + sentry_options_set_crash_upload_mode( + options, SENTRY_CRASH_UPLOAD_MODE_ASYNC); sentry_options_set_transfer_timeout(options, 45000); // Verify options were set correctly @@ -426,6 +432,7 @@ SENTRY_TEST(crash_context_options_propagation) ctx->user_agent[sizeof(ctx->user_agent) - 1] = '\0'; } ctx->shutdown_timeout = options->shutdown_timeout; + ctx->crash_upload_mode = options->crash_upload_mode; ctx->transfer_timeout = options->transfer_timeout; // Verify crash context received the values @@ -434,6 +441,8 @@ SENTRY_TEST(crash_context_options_propagation) // user_agent should have the default SDK user agent TEST_CHECK(ctx->user_agent[0] != '\0'); TEST_CHECK_UINT64_EQUAL(ctx->shutdown_timeout, 12345); + TEST_CHECK_INT_EQUAL( + ctx->crash_upload_mode, SENTRY_CRASH_UPLOAD_MODE_ASYNC); TEST_CHECK_UINT64_EQUAL(ctx->transfer_timeout, 45000); sentry_free(ctx);