From a80aa17a0182a2d9ac8c93e8bbcff649459b6821 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Fri, 15 May 2026 15:38:14 +0800 Subject: [PATCH] Centralize sysroot path translation and case-fold Move guest-to-host path resolution into a single entry point in src/syscall/path.{c,h}. path_translate_at honors three modes (no-follow, follow, create-with-optional-parents), preserves the resolver errno on failure so callers translate it via linux_errno() instead of flattening to ENAMETOOLONG, and rejects ".." in the basename when follow_final is false so an lstat cannot escape above the sysroot. Factor --sysroot/--create-sysroot provisioning out of main.c into src/core/sysroot.{c,h}. Validate caller-supplied sysroot length before any heap allocation, treat collision-sentinel truncation as a hard validation failure rather than failing open, and set errno on every parse path so the cleanup logger reports a real reason. Add a case-fold sidecar at src/syscall/sidecar.{c,h} for case-insensitive macOS volumes. The sidecar keeps colliding Linux guest names distinct by mapping each to a hidden token file plus a per-directory index, so that guest workloads relying on case-sensitive Linux path semantics still work on the host's case-insensitive APFS or HFS+. Procemu-virtual paths (/proc, /sys, /dev) short-circuit the sidecar walk after normalization so they reach the procemu intercept intact instead of failing with ENOENT against a directory that does not exist in the sysroot. Fix /proc/self/exe sysroot prefix strip: proc_set_sysroot stores the realpath canonical form, so the readlink handler now canonicalizes the stored elf_path before the prefix check, otherwise macOS symlinks such as /var -> /private/var make the strncmp diverge and leak the host path back to the guest. Serialize sysroot_casefold across fork IPC so child processes keep the sidecar feature after clone/fork. 
Lock elf_path against torn reads from sibling vCPUs during execve and expose proc_elf_path_snapshot for content-consuming callers; proc_get_elf_path keeps the legacy boolean-test contract. --- Makefile | 2 + mk/tests.mk | 38 +- src/core/sysroot.c | 561 +++++++++ src/core/sysroot.h | 28 + src/main.c | 123 +- src/runtime/fork-state.c | 17 +- src/runtime/procemu.c | 37 +- src/syscall/exec.c | 25 +- src/syscall/fs-stat.c | 59 +- src/syscall/fs-xattr.c | 56 +- src/syscall/fs.c | 527 +++----- src/syscall/path.c | 108 +- src/syscall/path.h | 27 +- src/syscall/proc-state.c | 181 ++- src/syscall/proc.h | 16 +- src/syscall/sidecar.c | 1880 +++++++++++++++++++++++++++++ src/syscall/sidecar.h | 48 + tests/test-case-collision.c | 482 ++++++++ tests/test-sysroot-create-paths.c | 256 ++++ 19 files changed, 3996 insertions(+), 475 deletions(-) create mode 100644 src/core/sysroot.c create mode 100644 src/core/sysroot.h create mode 100644 src/syscall/sidecar.c create mode 100644 src/syscall/sidecar.h create mode 100644 tests/test-case-collision.c create mode 100644 tests/test-sysroot-create-paths.c diff --git a/Makefile b/Makefile index 6b7bee6..45a921f 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ SRCS := \ core/stack.c \ core/vdso.c \ core/bootstrap.c \ + core/sysroot.c \ runtime/thread.c \ runtime/futex.c \ runtime/forkipc.c \ @@ -35,6 +36,7 @@ SRCS := \ syscall/translate.c \ syscall/mem.c \ syscall/path.c \ + syscall/sidecar.c \ syscall/fs.c \ syscall/fs-stat.c \ syscall/fs-xattr.c \ diff --git a/mk/tests.mk b/mk/tests.mk index b7c60c3..844b16c 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -6,6 +6,7 @@ test-glibc-coreutils test-perf \ test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \ test-full test-multi-vcpu test-rwx test-sysroot-rename \ + test-case-collision test-case-collision-fallback test-sysroot-create-paths \ test-proctitle-low-stack \ test-sysroot-procfs-exec test-timeout-disable \ test-sysroot-nofollow test-sysroot-chdir perf @@ -38,8 +39,8 @@ 
test-sysroot-rename: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-rename printf 'inside-sysroot\n' > "$$tmpdir/tmp/elfuse-sysroot-rename-src.txt"; \ rm -f /tmp/elfuse-sysroot-rename-dst.txt; \ $(ELFUSE_BIN) --sysroot "$$tmpdir" $(BUILD_DIR)/test-sysroot-rename; \ - if [ ! -f "$$tmpdir/tmp/elfuse-sysroot-rename-dst.txt" ]; then \ - printf "$(RED)FAIL$(RESET) rename did not create destination in sysroot\n"; \ + if [ -f "$$tmpdir/tmp/elfuse-sysroot-rename-src.txt" ]; then \ + printf "$(RED)FAIL$(RESET) rename did not remove source from sysroot\n"; \ exit 1; \ fi; \ if [ -e /tmp/elfuse-sysroot-rename-dst.txt ]; then \ @@ -60,6 +61,39 @@ test-sysroot-chdir: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-chdir mkdir -p "$$tmpdir/bin" "$$tmpdir/lib" "$$tmpdir/lib/elfuse-sysroot-shadow"; \ $(ELFUSE_BIN) --sysroot "$$tmpdir" $(BUILD_DIR)/test-sysroot-chdir +test-case-collision: $(ELFUSE_BIN) $(BUILD_DIR)/test-case-collision + @tmpdir=$$(mktemp -d); \ + trap 'rm -rf "$$tmpdir"' EXIT; \ + $(ELFUSE_BIN) --create-sysroot "$$tmpdir/case-sysroot" $(BUILD_DIR)/test-case-collision + +test-case-collision-fallback: $(ELFUSE_BIN) $(BUILD_DIR)/test-case-collision + @tmpdir=$$(mktemp -d); \ + trap 'rm -rf "$$tmpdir"' EXIT; \ + $(ELFUSE_BIN) --sysroot "$$tmpdir" $(BUILD_DIR)/test-case-collision + +test-sysroot-create-paths: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-create-paths + @tmpdir=$$(mktemp -d); \ + guest_tmp="/tmp/elfuse-sysroot-create-paths/file.txt"; \ + mounted_tmp="$$tmpdir/case-sysroot/tmp/elfuse-sysroot-create-paths/file.txt"; \ + host_out_dir="$$tmpdir/host-out"; \ + host_out="$$host_out_dir/result.txt"; \ + trap 'rm -rf "$$tmpdir"; rm -rf /tmp/elfuse-sysroot-create-paths' EXIT; \ + rm -rf /tmp/elfuse-sysroot-create-paths; \ + mkdir -p "$$host_out_dir"; \ + $(ELFUSE_BIN) --create-sysroot "$$tmpdir/case-sysroot" $(BUILD_DIR)/test-sysroot-create-paths "$$guest_tmp" "$$mounted_tmp" "$$host_out" "$$tmpdir/case-sysroot"; \ + if [ -e "$$guest_tmp" ]; then \ + printf "$(RED)FAIL$(RESET) guest 
/tmp escaped to host /tmp\n"; \ + exit 1; \ + fi; \ + if [ ! -f "$$host_out" ]; then \ + printf "$(RED)FAIL$(RESET) host fallback path was not created\n"; \ + exit 1; \ + fi; \ + if ! grep -q "host-fallback" "$$host_out"; then \ + printf "$(RED)FAIL$(RESET) host fallback file contents mismatch\n"; \ + exit 1; \ + fi + test-sysroot-procfs-exec: $(ELFUSE_BIN) $(BUILD_DIR)/test-procfs-exec @tmpdir=$$(mktemp -d); \ trap 'rm -rf "$$tmpdir"' EXIT; \ diff --git a/src/core/sysroot.c b/src/core/sysroot.c new file mode 100644 index 0000000..188c783 --- /dev/null +++ b/src/core/sysroot.c @@ -0,0 +1,561 @@ +/* Sysroot capability probing and provisioning + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "debug/log.h" +#include "utils.h" + +#include "core/sysroot.h" + +extern char **environ; + +#define CASE_COLLISION_SENTINEL_A "net/netfilter/xt_CONNMARK.h" +#define CASE_COLLISION_SENTINEL_B "net/netfilter/xt_connmark.h" + +typedef struct { + uint32_t length; + vol_capabilities_attr_t caps; +} volume_caps_buf_t; + +static int ensure_dir_exists_follow(const char *path) +{ + if (!path || path[0] == '\0') { + errno = EINVAL; + return -1; + } + + char buf[LINUX_PATH_MAX]; + size_t len = str_copy_trunc(buf, path, sizeof(buf)); + if (len >= sizeof(buf)) { + errno = ENAMETOOLONG; + return -1; + } + + for (char *p = buf + 1; *p; p++) { + if (*p != '/') + continue; + *p = '\0'; + if (mkdir(buf, 0755) < 0 && errno != EEXIST) + return -1; + *p = '/'; + } + + if (mkdir(buf, 0755) < 0 && errno != EEXIST) + return -1; + return 0; +} + +int sysroot_ensure_dir_exists(const char *path) +{ + if (!path || path[0] == '\0') { + errno = EINVAL; + return -1; + } + + char buf[LINUX_PATH_MAX]; + size_t len = str_copy_trunc(buf, path, sizeof(buf)); + if (len >= sizeof(buf)) { + errno = ENAMETOOLONG; + return -1; + } + + for (char *p = buf + 1; 
*p; p++) { + if (*p != '/') + continue; + *p = '\0'; + if (mkdir(buf, 0755) < 0) { + if (errno != EEXIST) + return -1; + + struct stat st; + if (lstat(buf, &st) < 0) + return -1; + if (S_ISLNK(st.st_mode)) { + errno = ELOOP; + return -1; + } + if (!S_ISDIR(st.st_mode)) { + errno = ENOTDIR; + return -1; + } + } + *p = '/'; + } + + if (mkdir(buf, 0755) < 0) { + if (errno != EEXIST) + return -1; + + struct stat st; + if (lstat(buf, &st) < 0) + return -1; + if (S_ISLNK(st.st_mode)) { + errno = ELOOP; + return -1; + } + if (!S_ISDIR(st.st_mode)) { + errno = ENOTDIR; + return -1; + } + } + return 0; +} + +int sysroot_validate_dir_prefix(const char *path) +{ + if (!path || path[0] == '\0') { + errno = EINVAL; + return -1; + } + + char buf[LINUX_PATH_MAX]; + size_t len = str_copy_trunc(buf, path, sizeof(buf)); + if (len >= sizeof(buf)) { + errno = ENAMETOOLONG; + return -1; + } + + for (char *p = buf + 1; *p; p++) { + if (*p != '/') + continue; + *p = '\0'; + + struct stat st; + if (lstat(buf, &st) < 0) { + if (errno == ENOENT) { + *p = '/'; + return 0; + } + return -1; + } + if (S_ISLNK(st.st_mode)) { + errno = ELOOP; + return -1; + } + if (!S_ISDIR(st.st_mode)) { + errno = ENOTDIR; + return -1; + } + *p = '/'; + } + + struct stat st; + if (lstat(buf, &st) < 0) { + if (errno == ENOENT) + return 0; + return -1; + } + if (S_ISLNK(st.st_mode)) { + errno = ELOOP; + return -1; + } + if (!S_ISDIR(st.st_mode)) { + errno = ENOTDIR; + return -1; + } + + return 0; +} + +static int spawn_capture_stdout(char *const argv[], + char *buf, + size_t bufsz, + int *status_out) +{ + int pipefd[2] = {-1, -1}; + if (pipe(pipefd) < 0) + return -1; + + posix_spawn_file_actions_t actions; + posix_spawn_file_actions_init(&actions); + posix_spawn_file_actions_adddup2(&actions, pipefd[1], STDOUT_FILENO); + posix_spawn_file_actions_addclose(&actions, pipefd[0]); + posix_spawn_file_actions_addclose(&actions, pipefd[1]); + + pid_t pid = -1; + int spawn_ret = posix_spawnp(&pid, argv[0], &actions, NULL, 
 argv, environ); + posix_spawn_file_actions_destroy(&actions); + close(pipefd[1]); + if (spawn_ret != 0) { + close(pipefd[0]); + errno = spawn_ret; + return -1; + } + + size_t off = 0; + while (buf && off + 1 < bufsz) { + ssize_t n = read(pipefd[0], buf + off, bufsz - off - 1); + if (n < 0) { + if (errno == EINTR) + continue; + break; + } + if (n == 0) + break; + off += (size_t) n; + } + if (buf && bufsz > 0) + buf[off] = '\0'; + close(pipefd[0]); + + int status = 0; + while (waitpid(pid, &status, 0) < 0) { + if (errno != EINTR) + return -1; + } + if (status_out) + *status_out = status; + return 0; +} + +static int spawn_simple(char *const argv[]) +{ + pid_t pid = -1; + int spawn_ret = posix_spawnp(&pid, argv[0], NULL, NULL, argv, environ); + if (spawn_ret != 0) { + errno = spawn_ret; + return -1; + } + + int status = 0; + while (waitpid(pid, &status, 0) < 0) { + if (errno != EINTR) + return -1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + errno = EIO; + return -1; + } + return 0; +} + +static int parse_attach_mountpoint(const char *plist, + char *mount_path, + size_t mount_path_sz) +{ + static const char *key = "mount-point"; + static const char *open_tag = "<string>"; + static const char *close_tag = "</string>"; + + const char *p = strstr(plist, key); + if (!p) { + errno = EPROTO; + return -1; + } + p = strstr(p, open_tag); + if (!p) { + errno = EPROTO; + return -1; + } + p += strlen(open_tag); + const char *end = strstr(p, close_tag); + if (!end) { + errno = EPROTO; + return -1; + } + + size_t len = (size_t) (end - p); + if (len + 1 > mount_path_sz) { + errno = ENAMETOOLONG; + return -1; + } + memcpy(mount_path, p, len); + mount_path[len] = '\0'; + return 0; +} + +static int probe_case_sensitivity_getattrlist(const char *path, + bool *case_sensitive, + bool *case_preserving) +{ + struct attrlist attrs; + memset(&attrs, 0, sizeof(attrs)); + attrs.bitmapcount = ATTR_BIT_MAP_COUNT; + attrs.volattr = ATTR_VOL_CAPABILITIES; + + volume_caps_buf_t caps; +
memset(&caps, 0, sizeof(caps)); + if (getattrlist(path, &attrs, &caps, sizeof(caps), 0) < 0) + return -1; + + uint32_t valid_format = caps.caps.valid[VOL_CAPABILITIES_FORMAT]; + uint32_t caps_format = caps.caps.capabilities[VOL_CAPABILITIES_FORMAT]; + if ((valid_format & VOL_CAP_FMT_CASE_SENSITIVE) == 0 || + (valid_format & VOL_CAP_FMT_CASE_PRESERVING) == 0) { + errno = ENOTSUP; + return -1; + } + + *case_sensitive = (caps_format & VOL_CAP_FMT_CASE_SENSITIVE) != 0; + *case_preserving = (caps_format & VOL_CAP_FMT_CASE_PRESERVING) != 0; + return 0; +} + +static int probe_case_sensitivity_pathconf(const char *path, + bool *case_sensitive, + bool *case_preserving) +{ + errno = 0; + long sensitive = pathconf(path, _PC_CASE_SENSITIVE); + if (sensitive < 0 && errno != 0) + return -1; + + errno = 0; + long preserving = pathconf(path, _PC_CASE_PRESERVING); + if (preserving < 0 && errno != 0) + return -1; + + if (sensitive < 0 || preserving < 0) { + errno = ENOTSUP; + return -1; + } + + *case_sensitive = sensitive != 0; + *case_preserving = preserving != 0; + return 0; +} + +int sysroot_probe_case_sensitivity(const char *path, + bool *case_sensitive, + bool *case_preserving) +{ + if (probe_case_sensitivity_pathconf(path, case_sensitive, + case_preserving) == 0) { + return 0; + } + return probe_case_sensitivity_getattrlist(path, case_sensitive, + case_preserving); +} + +/* Returns 1 if both sentinel paths exist under sysroot, 0 if at least one + * is absent, -1 on error (e.g. path truncation). Truncation must fail + * closed: returning 0 would silently admit a case-insensitive sysroot + * that should be rejected for colliding Linux pathnames. 
+ */ +static int collision_pair_exists(const char *sysroot, + const char *rel_a, + const char *rel_b) +{ + char a[LINUX_PATH_MAX]; + char b[LINUX_PATH_MAX]; + + if (snprintf(a, sizeof(a), "%s/%s", sysroot, rel_a) >= (int) sizeof(a) || + snprintf(b, sizeof(b), "%s/%s", sysroot, rel_b) >= (int) sizeof(b)) { + errno = ENAMETOOLONG; + return -1; + } + + struct stat st_a; + struct stat st_b; + return (lstat(a, &st_a) == 0 && lstat(b, &st_b) == 0) ? 1 : 0; +} + +int sysroot_validate_case_sensitivity(const char *path) +{ + if (!path || path[0] == '\0') + return 0; + + bool case_sensitive = false; + bool case_preserving = false; + if (sysroot_probe_case_sensitivity(path, &case_sensitive, + &case_preserving) < 0) { + log_warn("sysroot: could not probe case sensitivity for %s: %s", path, + strerror(errno)); + return 0; + } + + if (!case_preserving) { + log_error( + "sysroot %s is not case-preserving; guest pathnames cannot " + "round-trip safely on this volume", + path); + return -1; + } + + if (case_sensitive) + return 0; + + int collide = collision_pair_exists(path, CASE_COLLISION_SENTINEL_A, + CASE_COLLISION_SENTINEL_B); + if (collide < 0) { + log_error("sysroot %s: cannot probe case-collision sentinels: %s", path, + strerror(errno)); + return -1; + } + if (collide > 0) { + log_error( + "sysroot %s is case-insensitive and contains colliding Linux " + "kernel paths (%s and %s); refuse to run this workload on a " + "case-insensitive volume", + path, CASE_COLLISION_SENTINEL_A, CASE_COLLISION_SENTINEL_B); + return -1; + } + + log_warn( + "sysroot %s is case-insensitive; workloads with colliding guest " + "names such as Linux kernel trees may fail. 
Use --create-sysroot " + "to run inside a case-sensitive APFS sparsebundle.", + path); + return 0; +} + +static int sysroot_detach_mountpoint_force(const char *mount_path, bool force) +{ + if (force) { + char *const argv[] = {"hdiutil", "detach", "-force", + (char *) mount_path, NULL}; + return spawn_simple(argv); + } + + char *const argv[] = {"hdiutil", "detach", (char *) mount_path, NULL}; + return spawn_simple(argv); +} + +static bool sysroot_mountpoint_is_active(const char *mount_path) +{ + if (!mount_path || mount_path[0] == '\0') + return false; + + struct statfs *mntbuf = NULL; + int count = getmntinfo(&mntbuf, MNT_NOWAIT); + if (count <= 0 || !mntbuf) + return false; + + for (int i = 0; i < count; i++) { + if (!strcmp(mount_path, mntbuf[i].f_mntonname)) + return true; + } + + return false; +} + +static int write_spotlight_marker(const char *mount_path) +{ + char marker[LINUX_PATH_MAX]; + if (snprintf(marker, sizeof(marker), "%s/.metadata_never_index", + mount_path) >= (int) sizeof(marker)) { + errno = ENAMETOOLONG; + return -1; + } + + int fd = open(marker, O_WRONLY | O_CREAT | O_CLOEXEC, 0644); + if (fd < 0) + return -1; + close(fd); + return 0; +} + +int sysroot_create_mount(const char *mount_path, sysroot_mount_t *mount) +{ + if (!mount || !mount_path || mount_path[0] == '\0') { + errno = EINVAL; + return -1; + } + + memset(mount, 0, sizeof(*mount)); + + if (strlen(mount_path) + strlen(".sparsebundle") >= + sizeof(mount->image_path)) { + errno = ENAMETOOLONG; + return -1; + } + str_copy_trunc(mount->mount_path, mount_path, sizeof(mount->mount_path)); + snprintf(mount->image_path, sizeof(mount->image_path), "%s.sparsebundle", + mount_path); + + if (ensure_dir_exists_follow(mount_path) < 0) + return -1; + + if (sysroot_mountpoint_is_active(mount_path) && + sysroot_detach_mountpoint_force(mount_path, false) < 0 && + errno != EIO && errno != ENOENT) { + log_warn("sysroot: stale detach of %s failed: %s", mount_path, + strerror(errno)); + } + + struct stat st; 
+ if (lstat(mount->image_path, &st) < 0) { + if (errno != ENOENT) + return -1; + + char *const create_argv[] = {"hdiutil", + "create", + "-fs", + "Case-sensitive APFS", + "-size", + "16g", + "-type", + "SPARSEBUNDLE", + "-volname", + "elfuse_sysroot", + mount->image_path, + NULL}; + if (spawn_simple(create_argv) < 0) { + log_error("sysroot: hdiutil create failed for %s: %s", + mount->image_path, strerror(errno)); + return -1; + } + } + + char plist[32768]; + int status = 0; + char *const attach_argv[] = { + "hdiutil", "attach", "-mountpoint", mount->mount_path, + "-plist", mount->image_path, NULL}; + if (spawn_capture_stdout(attach_argv, plist, sizeof(plist), &status) < 0 || + !WIFEXITED(status) || WEXITSTATUS(status) != 0) { + log_error("sysroot: hdiutil attach failed for %s", mount->image_path); + return -1; + } + + char attached_mount[LINUX_PATH_MAX]; + if (parse_attach_mountpoint(plist, attached_mount, sizeof(attached_mount)) < + 0) { + log_error( + "sysroot: could not parse mount point from hdiutil attach " + "plist for %s", + mount->image_path); + sysroot_detach_mountpoint_force(mount->mount_path, true); + return -1; + } + + str_copy_trunc(mount->mount_path, attached_mount, + sizeof(mount->mount_path)); + mount->active = true; + mount->detach_on_cleanup = true; + + if (write_spotlight_marker(mount->mount_path) < 0) { + log_warn("sysroot: failed to create Spotlight marker in %s: %s", + mount->mount_path, strerror(errno)); + } + + return 0; +} + +void sysroot_cleanup_mount(sysroot_mount_t *mount) +{ + if (!mount || !mount->active || !mount->detach_on_cleanup) + return; + + if (sysroot_detach_mountpoint_force(mount->mount_path, true) < 0) { + log_warn("sysroot: detach %s failed: %s", mount->mount_path, + strerror(errno)); + } + + mount->active = false; +} diff --git a/src/core/sysroot.h b/src/core/sysroot.h new file mode 100644 index 0000000..3fe83c2 --- /dev/null +++ b/src/core/sysroot.h @@ -0,0 +1,28 @@ +/* Sysroot capability probing and provisioning + * + * 
Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +#include "syscall/internal.h" + +typedef struct { + bool active; + bool detach_on_cleanup; + char mount_path[LINUX_PATH_MAX]; + char image_path[LINUX_PATH_MAX]; +} sysroot_mount_t; + +int sysroot_validate_case_sensitivity(const char *path); +int sysroot_probe_case_sensitivity(const char *path, + bool *case_sensitive, + bool *case_preserving); +int sysroot_create_mount(const char *mount_path, sysroot_mount_t *mount); +void sysroot_cleanup_mount(sysroot_mount_t *mount); +int sysroot_ensure_dir_exists(const char *path); +int sysroot_validate_dir_prefix(const char *path); diff --git a/src/main.c b/src/main.c index f82ca45..cebf591 100644 --- a/src/main.c +++ b/src/main.c @@ -25,8 +25,11 @@ #include #include +#include "utils.h" + #include "core/bootstrap.h" #include "core/guest.h" +#include "core/sysroot.h" #include "runtime/forkipc.h" #include "runtime/proctitle.h" @@ -58,13 +61,18 @@ static void free_guest_argv(const char **guest_argv, int guest_argc) static void cleanup_main_resources(guest_t *g, bool guest_initialized, + sysroot_mount_t *sysroot_mount, + const char *host_cwd, const char **guest_argv, int guest_argc, - const char *elf_path, - const char *sysroot_path) + char *elf_path, + char *sysroot_path) { if (guest_initialized) guest_destroy(g); + if (host_cwd && host_cwd[0] != '\0' && chdir(host_cwd) < 0) + (void) chdir("/"); + sysroot_cleanup_mount(sysroot_mount); free_guest_argv(guest_argv, guest_argc); free((void *) elf_path); free((void *) sysroot_path); @@ -114,6 +122,7 @@ int main(int argc, char **argv) bool verbose = false; int timeout_sec = 10, fork_child_fd = -1, vfork_notify_fd = -1; const char *sysroot = NULL; + const char *create_sysroot = NULL; int gdb_port = 0; bool gdb_stop_on_entry = false; int arg_start = 1; @@ -127,6 +136,7 @@ int main(int argc, char **argv) if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) { printf( 
"usage: elfuse [--verbose] [--timeout N] [--sysroot PATH]\n" + " [--create-sysroot PATH]\n" " [--gdb PORT] [--gdb-stop-on-entry]\n" " [args...]\n" "\n" @@ -138,6 +148,8 @@ int main(int argc, char **argv) "(seconds, default 10; 0 disables)\n" " --sysroot PATH Resolve absolute guest paths under " "PATH first\n" + " --create-sysroot PATH Provision and use a case-sensitive " + "APFS sparsebundle mounted at PATH\n" " --gdb PORT Listen for GDB Remote Serial " "Protocol on PORT\n" " --gdb-stop-on-entry Halt before the first guest " @@ -183,6 +195,10 @@ int main(int argc, char **argv) arg_start + 1 < argc) { sysroot = argv[arg_start + 1]; arg_start += 2; + } else if (!strcmp(argv[arg_start], "--create-sysroot") && + arg_start + 1 < argc) { + create_sysroot = argv[arg_start + 1]; + arg_start += 2; } else if (!strcmp(argv[arg_start], "--gdb") && arg_start + 1 < argc) { if (parse_int_arg(argv[arg_start + 1], 1, 65535, &gdb_port) < 0) { log_error("invalid GDB port: %s", argv[arg_start + 1]); @@ -201,12 +217,18 @@ int main(int argc, char **argv) log_error("unknown option: %s", argv[arg_start]); log_error( "usage: elfuse [--verbose] [--timeout N] " - "[--sysroot PATH] [--gdb PORT] " + "[--sysroot PATH] [--create-sysroot PATH] [--gdb PORT] " "[--gdb-stop-on-entry] [args...]"); return 1; } } + if (sysroot && create_sysroot) { + log_error( + "use either --sysroot PATH or --create-sysroot PATH, not both"); + return 1; + } + /* Fork-child mode: receive VM state over IPC and run */ if (fork_child_fd >= 0) return fork_child_main(fork_child_fd, vfork_notify_fd, verbose, @@ -215,7 +237,7 @@ int main(int argc, char **argv) if (arg_start >= argc) { log_error( "usage: elfuse [--verbose] [--timeout N] " - "[--sysroot PATH] [args...]"); + "[--sysroot PATH] [--create-sysroot PATH] [args...]"); return 1; } @@ -223,29 +245,79 @@ int main(int argc, char **argv) * data lives in a contiguous stack region that elfuse clobbers below for * the process title (PostgreSQL/nginx argv-clobber technique). 
*/ - const char *elf_path = strdup(argv[arg_start]); - bool have_sysroot = (sysroot != NULL); - const char *sysroot_path = have_sysroot ? strdup(sysroot) : NULL; + char *elf_path = strdup(argv[arg_start]); + bool have_sysroot = (sysroot != NULL || create_sysroot != NULL); + const char *sysroot_src = create_sysroot ? create_sysroot : sysroot; + char *sysroot_path = NULL; + if (have_sysroot) { + sysroot_path = (char *) calloc(LINUX_PATH_MAX, 1); + if (sysroot_path) { + size_t src_len = + str_copy_trunc(sysroot_path, sysroot_src, LINUX_PATH_MAX); + if (src_len >= LINUX_PATH_MAX) { + log_error("sysroot path too long (%zu bytes, max %d): %s", + src_len, LINUX_PATH_MAX - 1, sysroot_src); + free(elf_path); + free(sysroot_path); + return 1; + } + } + } sysroot = sysroot_path; int guest_argc = argc - arg_start; const char **guest_argv = (const char **) calloc((size_t) guest_argc, sizeof(char *)); guest_t g; bool guest_initialized = false; + sysroot_mount_t sysroot_mount; + char host_cwd[LINUX_PATH_MAX]; + bool have_host_cwd = (getcwd(host_cwd, sizeof(host_cwd)) != NULL); + memset(&sysroot_mount, 0, sizeof(sysroot_mount)); if (!elf_path || (have_sysroot && !sysroot_path) || !guest_argv) { log_error("out of memory"); - cleanup_main_resources(&g, guest_initialized, guest_argv, guest_argc, - elf_path, sysroot_path); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); return 1; } for (int i = 0; i < guest_argc; i++) { guest_argv[i] = strdup(argv[arg_start + i]); if (!guest_argv[i]) { log_error("out of memory"); - cleanup_main_resources(&g, guest_initialized, guest_argv, + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? 
host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + return 1; + } + } + + if (create_sysroot) { + if (sysroot_create_mount(sysroot_path, &sysroot_mount) < 0) { + log_error("failed to provision case-sensitive sysroot at %s: %s", + sysroot_path, strerror(errno)); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, guest_argc, elf_path, sysroot_path); return 1; } + size_t mounted_len = str_copy_trunc( + sysroot_path, sysroot_mount.mount_path, LINUX_PATH_MAX); + if (mounted_len >= LINUX_PATH_MAX) { + log_error("mounted sysroot path too long: %s", + sysroot_mount.mount_path); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + return 1; + } + sysroot = sysroot_path; + } + + if (have_sysroot && sysroot_validate_case_sensitivity(sysroot) < 0) { + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + return 1; } guest_bootstrap_t boot; @@ -254,18 +326,33 @@ int main(int argc, char **argv) if (guest_bootstrap_prepare(&g, elf_path, sysroot, guest_argc, guest_argv, environ, shim_bin, shim_bin_len, verbose, &guest_initialized, &boot) < 0) { - cleanup_main_resources(&g, guest_initialized, guest_argv, guest_argc, - elf_path, sysroot_path); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? 
host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); return 1; } + if (have_sysroot) { + bool case_sensitive = true; + bool case_preserving = true; + if (sysroot_probe_case_sensitivity(sysroot, &case_sensitive, + &case_preserving) == 0) { + proc_set_sysroot_casefold(case_preserving && !case_sensitive); + } else { + proc_set_sysroot_casefold(false); + } + } else { + proc_set_sysroot_casefold(false); + } + runtime_set_process_title(argc, argv, elf_path); hv_vcpu_t vcpu; hv_vcpu_exit_t *vexit; if (guest_bootstrap_create_vcpu(&g, &boot, verbose, &vcpu, &vexit) < 0) { - cleanup_main_resources(&g, guest_initialized, guest_argv, guest_argc, - elf_path, sysroot_path); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); return 1; } @@ -275,7 +362,8 @@ int main(int argc, char **argv) if (gdb_port > 0) { if (gdb_stub_init(gdb_port, &g) < 0) { log_error("failed to initialize GDB stub"); - cleanup_main_resources(&g, guest_initialized, guest_argv, + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, guest_argc, elf_path, sysroot_path); return 1; } @@ -292,8 +380,9 @@ int main(int argc, char **argv) /* Tear down debugger state before freeing guest/vCPU resources. */ gdb_stub_shutdown(); - cleanup_main_resources(&g, guest_initialized, guest_argv, guest_argc, - elf_path, sysroot_path); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? 
host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); return exit_code; } diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index c168f8f..ed50f16 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -441,13 +441,14 @@ int fork_ipc_send_process_state(int ipc_sock, char sysroot_ipc[LINUX_PATH_MAX] = {0}; (void) proc_sysroot_snapshot(sysroot_ipc, sizeof(sysroot_ipc)); - if (fork_ipc_write_all(ipc_sock, sysroot_ipc, sizeof(sysroot_ipc)) < 0) + uint8_t sysroot_casefold_ipc = proc_sysroot_casefold_enabled() ? 1 : 0; + if (fork_ipc_write_all(ipc_sock, sysroot_ipc, sizeof(sysroot_ipc)) < 0 || + fork_ipc_write_all(ipc_sock, &sysroot_casefold_ipc, + sizeof(sysroot_casefold_ipc)) < 0) return -1; char elf_path_ipc[LINUX_PATH_MAX] = {0}; - const char *ep = proc_get_elf_path(); - if (ep) - str_copy_trunc(elf_path_ipc, ep, sizeof(elf_path_ipc)); + (void) proc_elf_path_snapshot(elf_path_ipc, sizeof(elf_path_ipc)); char elfuse_path_ipc[LINUX_PATH_MAX] = {0}; const char *hp = proc_get_elfuse_path(); if (hp) @@ -585,6 +586,14 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) if (sysroot_ipc[0] != '\0') proc_set_sysroot(sysroot_ipc); + uint8_t sysroot_casefold_ipc = 0; + if (fork_ipc_read_all(ipc_fd, &sysroot_casefold_ipc, + sizeof(sysroot_casefold_ipc)) < 0) { + log_error("fork-child: failed to read sysroot casefold"); + return -1; + } + proc_set_sysroot_casefold(sysroot_casefold_ipc != 0); + char elf_path_ipc[LINUX_PATH_MAX]; if (fork_ipc_read_all(ipc_fd, elf_path_ipc, sizeof(elf_path_ipc)) < 0) { log_error("fork-child: failed to read elf path"); diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index c5097d2..852f475 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -547,11 +547,16 @@ static int proc_emit_literal(const char *s) */ static const char *proc_comm_name(void) { - const char *exe = proc_get_elf_path(); - if (!exe) + /* Snapshot into a thread-local buffer so a concurrent 
execve cannot + * tear the shared elf_path under the basename scan. The TLS lifetime + * matches the calling thread, which is what callers (printf-style + * formatters) require. + */ + static _Thread_local char exe_tls[LINUX_PATH_MAX]; + if (!proc_elf_path_snapshot(exe_tls, sizeof(exe_tls))) return "elfuse"; - const char *slash = strrchr(exe, '/'); - return slash ? slash + 1 : exe; + const char *slash = strrchr(exe_tls, '/'); + return slash ? slash + 1 : exe_tls; } /* Parse the numeric tail of a /proc/.../ or /dev/fd/ path. @@ -1556,8 +1561,8 @@ int proc_intercept_open(const guest_t *g, * return an actual file descriptor to the binary. */ if (!strcmp(path, "/proc/self/exe")) { - const char *exe = proc_get_elf_path(); - if (!exe) { + char exe[LINUX_PATH_MAX]; + if (!proc_elf_path_snapshot(exe, sizeof(exe))) { errno = ENOENT; return -1; } @@ -2566,17 +2571,27 @@ int proc_intercept_readlink(const char *path, char *buf, size_t bufsiz) * abstraction the rest of the path layer presents. */ if (!strcmp(path, "/proc/self/exe")) { - const char *exe = proc_get_elf_path(); - if (!exe) { + char exe_buf[LINUX_PATH_MAX]; + if (!proc_elf_path_snapshot(exe_buf, sizeof(exe_buf))) { errno = ENOENT; return -1; } + const char *exe = exe_buf; + char exe_real[LINUX_PATH_MAX]; char sysroot_snap[LINUX_PATH_MAX]; if (proc_sysroot_snapshot(sysroot_snap, sizeof(sysroot_snap))) { + /* proc_set_sysroot stores a realpath()-canonicalized form, so + * canonicalize exe before the prefix check or the strip fails + * when /var -> /private/var (and similar macOS symlinks) make + * the two strings diverge. 
+ */ + const char *exe_cmp = exe; + if (realpath(exe, exe_real)) + exe_cmp = exe_real; size_t sr_len = strlen(sysroot_snap); - if (sr_len > 0 && strncmp(exe, sysroot_snap, sr_len) == 0 && - (exe[sr_len] == '/' || exe[sr_len] == '\0')) { - exe += sr_len; + if (sr_len > 0 && !strncmp(exe_cmp, sysroot_snap, sr_len) && + (exe_cmp[sr_len] == '/' || exe_cmp[sr_len] == '\0')) { + exe = exe_cmp + sr_len; if (*exe == '\0') exe = "/"; } diff --git a/src/syscall/exec.c b/src/syscall/exec.c index bee7549..964193b 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -166,9 +166,15 @@ int64_t sys_execve(hv_vcpu_t vcpu, log_debug("execve resolved to \"%s\"", path); } - if (!host_path && path[0] == '/') - path_host = path_resolve_sysroot_path(path, path_host_buf, - sizeof(path_host_buf)); + if (!host_path) { + path_translation_t tx; + if (path_translate_at(LINUX_AT_FDCWD, path, PATH_TR_NONE, &tx) < 0) { + err = linux_errno(); + goto fail; + } + str_copy_trunc(path_host_buf, tx.host_path, sizeof(path_host_buf)); + path_host = path_host_buf; + } if (!path_host) { err = -LINUX_ENAMETOOLONG; goto fail; @@ -292,14 +298,15 @@ int64_t sys_execve(hv_vcpu_t vcpu, /* Continue the same exec transaction using the interpreter image. 
*/ str_copy_trunc(path, interp_start, sizeof(path)); - path_host = path; - if (path[0] == '/') - path_host = path_resolve_sysroot_path(path, path_host_buf, - sizeof(path_host_buf)); - if (!path_host) { - err = -LINUX_ENAMETOOLONG; + path_translation_t interp_tx; + if (path_translate_at(LINUX_AT_FDCWD, path, PATH_TR_NONE, &interp_tx) < + 0) { + err = linux_errno(); goto fail; } + str_copy_trunc(path_host_buf, interp_tx.host_path, + sizeof(path_host_buf)); + path_host = path_host_buf; if (elf_load(path_host, &elf_info) < 0) { err = -LINUX_ENOENT; diff --git a/src/syscall/fs-stat.c b/src/syscall/fs-stat.c index b6bd862..a1e17e8 100644 --- a/src/syscall/fs-stat.c +++ b/src/syscall/fs-stat.c @@ -182,16 +182,14 @@ static int64_t stat_at_path(guest_t *g, sizeof(path), &pathp) < 0) return -LINUX_EFAULT; - char proc_path[LINUX_PATH_MAX]; - const char *intercept_path = pathp; - int proc_resolved = - resolve_proc_at_path(dirfd, pathp, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - intercept_path = proc_path; - - if (proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && pathp[0] != '/' && + path_translation_t tx; + if (path_translate_at(dirfd, pathp, + (flags & LINUX_AT_SYMLINK_NOFOLLOW) ? PATH_TR_NOFOLLOW + : PATH_TR_NONE, + &tx) < 0) + return linux_errno(); + + if (tx.proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && pathp[0] != '/' && pathp[0] != '\0' && !proc_get_sysroot()) { int mac_flags = translate_at_flags(flags); if (fstatat(AT_FDCWD, pathp, mac_st, mac_flags) < 0) @@ -205,37 +203,34 @@ static int64_t stat_at_path(guest_t *g, int64_t rc = 0; if ((flags & LINUX_AT_EMPTY_PATH) && pathp[0] == '\0') { - if (dir_ref.fd < 0 || dir_ref.fd == AT_FDCWD) { + /* Linux: AT_EMPTY_PATH with dirfd == AT_FDCWD operates on the + * current working directory. 
+ */ + if (dir_ref.fd == AT_FDCWD) { + int mac_flags = translate_at_flags(flags); + if (fstatat(AT_FDCWD, ".", mac_st, mac_flags) < 0) { + rc = linux_errno(); + goto done; + } + } else if (dir_ref.fd < 0) { rc = -LINUX_EBADF; goto done; - } - if (fstat(dir_ref.fd, mac_st) < 0) { + } else if (fstat(dir_ref.fd, mac_st) < 0) { rc = linux_errno(); goto done; } } else { int intercepted = PROC_NOT_INTERCEPTED; - if (path_might_use_stat_intercept(intercept_path)) { - intercepted = proc_intercept_stat(intercept_path, mac_st); + if (path_might_use_stat_intercept(tx.intercept_path)) { + intercepted = proc_intercept_stat(tx.intercept_path, mac_st); if (intercepted == -1) { rc = linux_errno(); goto done; } } if (intercepted == PROC_NOT_INTERCEPTED) { - char sysroot_buf[LINUX_PATH_MAX]; - const char *stat_path = - (flags & LINUX_AT_SYMLINK_NOFOLLOW) - ? path_resolve_sysroot_nofollow_path(pathp, sysroot_buf, - sizeof(sysroot_buf)) - : path_resolve_sysroot_path(pathp, sysroot_buf, - sizeof(sysroot_buf)); - if (!stat_path) { - rc = -LINUX_ENAMETOOLONG; - goto done; - } int mac_flags = translate_at_flags(flags); - if (fstatat(dir_ref.fd, stat_path, mac_st, mac_flags) < 0) { + if (fstatat(dir_ref.fd, tx.host_path, mac_st, mac_flags) < 0) { rc = linux_errno(); goto done; } @@ -299,14 +294,12 @@ int64_t sys_statfs(guest_t *g, uint64_t path_gva, uint64_t buf_gva) if (guest_read_str_small(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; - char sysroot_buf[LINUX_PATH_MAX]; - const char *fs_path = - path_resolve_sysroot_path(path, sysroot_buf, sizeof(sysroot_buf)); - if (!fs_path) - return -LINUX_ENAMETOOLONG; + path_translation_t tx; + if (path_translate_at(LINUX_AT_FDCWD, path, PATH_TR_NONE, &tx) < 0) + return linux_errno(); struct statfs mac_st; - if (statfs(fs_path, &mac_st) < 0) + if (statfs(tx.host_path, &mac_st) < 0) return linux_errno(); linux_statfs_t lin_st; diff --git a/src/syscall/fs-xattr.c b/src/syscall/fs-xattr.c index 765f4ac..f8fdcff 100644 --- 
a/src/syscall/fs-xattr.c +++ b/src/syscall/fs-xattr.c @@ -74,23 +74,20 @@ int64_t sys_getxattr(guest_t *g, int nofollow) { char path[LINUX_PATH_MAX], name[LINUX_XATTR_NAME_MAX + 1]; - char sysroot_buf[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; if (guest_read_str(g, name_gva, name, sizeof(name)) < 0) return -LINUX_EFAULT; - const char *target = nofollow ? path_resolve_sysroot_nofollow_path( - path, sysroot_buf, sizeof(sysroot_buf)) - : path_resolve_sysroot_path( - path, sysroot_buf, sizeof(sysroot_buf)); - if (!target) - return -LINUX_ENAMETOOLONG; + path_translation_t tx; + if (path_translate_at(LINUX_AT_FDCWD, path, + nofollow ? PATH_TR_NOFOLLOW : PATH_TR_NONE, &tx) < 0) + return linux_errno(); int opts = nofollow ? XATTR_NOFOLLOW : 0; if (size == 0) { - ssize_t ret = getxattr(target, name, NULL, 0, 0, opts); + ssize_t ret = getxattr(tx.host_path, name, NULL, 0, 0, opts); return ret < 0 ? linux_errno() : ret; } @@ -99,7 +96,7 @@ int64_t sys_getxattr(guest_t *g, if (err < 0) return err; - ssize_t ret = getxattr(target, name, buf, (size_t) size, 0, opts); + ssize_t ret = getxattr(tx.host_path, name, buf, (size_t) size, 0, opts); int64_t result = xattr_copy_out_result(g, value_gva, buf, ret); free(buf); return result; @@ -114,18 +111,15 @@ int64_t sys_setxattr(guest_t *g, int nofollow) { char path[LINUX_PATH_MAX], name[LINUX_XATTR_NAME_MAX + 1]; - char sysroot_buf[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; if (guest_read_str(g, name_gva, name, sizeof(name)) < 0) return -LINUX_EFAULT; - const char *target = nofollow ? path_resolve_sysroot_nofollow_path( - path, sysroot_buf, sizeof(sysroot_buf)) - : path_resolve_sysroot_path( - path, sysroot_buf, sizeof(sysroot_buf)); - if (!target) - return -LINUX_ENAMETOOLONG; + path_translation_t tx; + if (path_translate_at(LINUX_AT_FDCWD, path, + nofollow ? 
PATH_TR_NOFOLLOW : PATH_TR_NONE, &tx) < 0) + return linux_errno(); void *buf; int64_t err = xattr_alloc_buf(size, &buf); @@ -143,7 +137,7 @@ int64_t sys_setxattr(guest_t *g, return err; } - int ret = setxattr(target, name, buf, (size_t) size, 0, opts); + int ret = setxattr(tx.host_path, name, buf, (size_t) size, 0, opts); free(buf); return ret < 0 ? linux_errno() : 0; } @@ -155,21 +149,18 @@ int64_t sys_listxattr(guest_t *g, int nofollow) { char path[LINUX_PATH_MAX]; - char sysroot_buf[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; - const char *target = nofollow ? path_resolve_sysroot_nofollow_path( - path, sysroot_buf, sizeof(sysroot_buf)) - : path_resolve_sysroot_path( - path, sysroot_buf, sizeof(sysroot_buf)); - if (!target) - return -LINUX_ENAMETOOLONG; + path_translation_t tx; + if (path_translate_at(LINUX_AT_FDCWD, path, + nofollow ? PATH_TR_NOFOLLOW : PATH_TR_NONE, &tx) < 0) + return linux_errno(); int opts = nofollow ? XATTR_NOFOLLOW : 0; if (size == 0) { - ssize_t ret = listxattr(target, NULL, 0, opts); + ssize_t ret = listxattr(tx.host_path, NULL, 0, opts); return ret < 0 ? linux_errno() : ret; } @@ -178,7 +169,7 @@ int64_t sys_listxattr(guest_t *g, if (err < 0) return err; - ssize_t ret = listxattr(target, buf, (size_t) size, opts); + ssize_t ret = listxattr(tx.host_path, buf, (size_t) size, opts); int64_t result = xattr_copy_out_result(g, list_gva, buf, ret); free(buf); return result; @@ -190,21 +181,18 @@ int64_t sys_removexattr(guest_t *g, int nofollow) { char path[LINUX_PATH_MAX], name[LINUX_XATTR_NAME_MAX + 1]; - char sysroot_buf[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; if (guest_read_str(g, name_gva, name, sizeof(name)) < 0) return -LINUX_EFAULT; - const char *target = nofollow ? 
path_resolve_sysroot_nofollow_path( - path, sysroot_buf, sizeof(sysroot_buf)) - : path_resolve_sysroot_path( - path, sysroot_buf, sizeof(sysroot_buf)); - if (!target) - return -LINUX_ENAMETOOLONG; + path_translation_t tx; + if (path_translate_at(LINUX_AT_FDCWD, path, + nofollow ? PATH_TR_NOFOLLOW : PATH_TR_NONE, &tx) < 0) + return linux_errno(); int opts = nofollow ? XATTR_NOFOLLOW : 0; - int ret = removexattr(target, name, opts); + int ret = removexattr(tx.host_path, name, opts); return ret < 0 ? linux_errno() : 0; } diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 6f90719..396afad 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -32,6 +32,7 @@ #include "syscall/net.h" /* absock_unregister_fd */ #include "syscall/path.h" #include "syscall/proc.h" +#include "syscall/sidecar.h" /* Linux dirent64 layout. */ typedef struct { @@ -208,6 +209,20 @@ static int fd_alloc_opened_host(int host_fd, return guest_fd; } +static int64_t read_translated_path(guest_t *g, + int dirfd, + uint64_t path_gva, + unsigned int tx_flags, + char path[LINUX_PATH_MAX], + path_translation_t *tx) +{ + if (guest_read_str(g, path_gva, path, LINUX_PATH_MAX) < 0) + return -LINUX_EFAULT; + if (path_translate_at(dirfd, path, tx_flags, tx) < 0) + return linux_errno(); + return 0; +} + /* open/close. 
*/ int64_t sys_openat_path(guest_t *g, @@ -216,20 +231,37 @@ int64_t sys_openat_path(guest_t *g, int linux_flags, int mode) { - char proc_path[LINUX_PATH_MAX]; - const char *guest_path = pathp; - const char *intercept_path = pathp; - int proc_resolved = - resolve_proc_at_path(dirfd, pathp, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) { - guest_path = proc_path; - intercept_path = proc_path; + if (linux_flags & LINUX_O_CREAT) { + int sidecar_fd = + sidecar_openat(dirfd, pathp, linux_flags, (mode_t) mode); + if (sidecar_fd != (int) SIDECAR_NOT_HANDLED) { + if (sidecar_fd < 0) + return linux_errno(); + int type = opened_fd_type(sidecar_fd, linux_flags); + if (type < 0) { + close_keep_errno(sidecar_fd); + return linux_errno(); + } + int guest_fd = + fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1); + if (guest_fd < 0) { + close_keep_errno(sidecar_fd); + return linux_errno(); + } + return guest_fd; + } } + path_translation_t tx; + unsigned int tx_flags = + (linux_flags & LINUX_O_NOFOLLOW) ? 
PATH_TR_NOFOLLOW : PATH_TR_NONE; + if (linux_flags & LINUX_O_CREAT) + tx_flags = PATH_TR_CREATE | PATH_TR_CREATE_PARENTS; + if (path_translate_at(dirfd, pathp, tx_flags, &tx) < 0) + return linux_errno(); + int flags = translate_open_flags(linux_flags); - if (proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && pathp[0] != '/' && + if (tx.proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && pathp[0] != '/' && !proc_get_sysroot()) { int host_fd = openat(AT_FDCWD, pathp, flags, mode); if (host_fd < 0) @@ -249,9 +281,9 @@ int64_t sys_openat_path(guest_t *g, } /* Intercept /proc and /dev paths before touching the host filesystem */ - if (path_might_use_open_intercept(intercept_path)) { + if (path_might_use_open_intercept(tx.intercept_path)) { int intercepted = - proc_intercept_open(g, intercept_path, linux_flags, mode); + proc_intercept_open(g, tx.intercept_path, linux_flags, mode); if (intercepted >= 0) { /* Got a host fd from the intercept. Device nodes (/dev/...) use * fd_alloc() for POSIX lowest-fd semantics because busybox sh @@ -265,14 +297,14 @@ int64_t sys_openat_path(guest_t *g, return linux_errno(); } int min_guest_fd = - (!strncmp(intercept_path, "/dev/", 5)) ? -1 : 128; + (!strncmp(tx.intercept_path, "/dev/", 5)) ? -1 : 128; int guest_fd = fd_alloc_opened_host(intercepted, type, linux_flags, min_guest_fd); if (guest_fd < 0) { close_keep_errno(intercepted); return linux_errno(); } - fd_note_proc_path(guest_fd, intercept_path); + fd_note_proc_path(guest_fd, tx.intercept_path); return guest_fd; } if (intercepted == -1) { @@ -282,18 +314,8 @@ int64_t sys_openat_path(guest_t *g, /* intercepted == PROC_NOT_INTERCEPTED: fall through to real openat */ } - char sysroot_buf[LINUX_PATH_MAX]; - const char *open_path = - (linux_flags & LINUX_O_NOFOLLOW) - ? 
path_resolve_sysroot_nofollow_path(guest_path, sysroot_buf, - sizeof(sysroot_buf)) - : path_resolve_sysroot_path(guest_path, sysroot_buf, - sizeof(sysroot_buf)); - if (!open_path) - return -LINUX_ENAMETOOLONG; - if (dirfd == LINUX_AT_FDCWD) { - int host_fd = open(open_path, flags, mode); + int host_fd = open(tx.host_path, flags, mode); if (host_fd < 0) return linux_errno(); @@ -314,7 +336,7 @@ int64_t sys_openat_path(guest_t *g, if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; - int host_fd = openat(dir_ref.fd, open_path, flags, mode); + int host_fd = openat(dir_ref.fd, tx.host_path, flags, mode); host_fd_ref_close(&dir_ref); if (host_fd < 0) return linux_errno(); @@ -748,7 +770,15 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) if (!de) break; - size_t name_len = strlen(de->d_name); + char guest_name[NAME_MAX + 1]; + int name_rc = path_translate_dirent_name(fd, de->d_name, guest_name, + sizeof(guest_name)); + if (name_rc > 0) + continue; + if (name_rc < 0) + return guest_pos > 0 ? (int64_t) guest_pos : linux_errno(); + + size_t name_len = strlen(guest_name); /* Linux dirent64: 19-byte header + name + null, padded to 8 */ size_t reclen = (19 + name_len + 1 + 7) & ~7ULL; @@ -768,7 +798,7 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) * guest_write() which handles 2MiB block boundary crossings. 
*/ memcpy(entry_buf, &lde, sizeof(lde)); - memcpy(entry_buf + 19, de->d_name, name_len + 1); + memcpy(entry_buf + 19, guest_name, name_len + 1); size_t pad_start = 19 + name_len + 1; if (pad_start < reclen) memset(entry_buf + pad_start, 0, reclen - pad_start); @@ -784,29 +814,25 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) int64_t sys_chdir(guest_t *g, uint64_t path_gva) { - char path[LINUX_PATH_MAX], proc_path[LINUX_PATH_MAX]; - if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) - return -LINUX_EFAULT; + char path[LINUX_PATH_MAX]; + path_translation_t tx; + int64_t rc = read_translated_path(g, LINUX_AT_FDCWD, path_gva, PATH_TR_NONE, + path, &tx); + if (rc < 0) + return rc; char proc_virt[64]; - const char *chdir_path = path; - int proc_resolved = resolve_proc_at_path(LINUX_AT_FDCWD, path, proc_path, - sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - chdir_path = proc_path; - const char *virt = - proc_virtual_dir_path(chdir_path, proc_virt, sizeof(proc_virt)); + proc_virtual_dir_path(tx.guest_path, proc_virt, sizeof(proc_virt)); if (virt) { - int host_fd = proc_intercept_open(g, chdir_path, LINUX_O_DIRECTORY, 0); + int host_fd = + proc_intercept_open(g, tx.intercept_path, LINUX_O_DIRECTORY, 0); if (host_fd < 0) return linux_errno(); - int rc = fchdir(host_fd); + int chdir_rc = fchdir(host_fd); int saved_errno = errno; close_keep_errno(host_fd); - if (rc < 0) { + if (chdir_rc < 0) { errno = saved_errno; return linux_errno(); } @@ -814,12 +840,7 @@ int64_t sys_chdir(guest_t *g, uint64_t path_gva) return 0; } - char sysroot_buf[LINUX_PATH_MAX]; - const char *target = - path_resolve_sysroot_path(chdir_path, sysroot_buf, sizeof(sysroot_buf)); - if (!target) - return -LINUX_ENAMETOOLONG; - if (chdir(target) < 0) + if (chdir(tx.host_path) < 0) return linux_errno(); if (proc_cwd_refresh() < 0) @@ -941,21 +962,16 @@ int64_t sys_readlinkat(guest_t *g, uint64_t bufsiz) { char 
path[LINUX_PATH_MAX]; - if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) - return -LINUX_EFAULT; - char proc_path[LINUX_PATH_MAX]; - const char *intercept_path = path; - int proc_resolved = - resolve_proc_at_path(dirfd, path, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - intercept_path = proc_path; + path_translation_t tx; + int64_t rc = + read_translated_path(g, dirfd, path_gva, PATH_TR_NOFOLLOW, path, &tx); + if (rc < 0) + return rc; /* Intercept /proc paths (e.g. /proc/self/exe, /proc/self/fd/N) */ char link[LINUX_PATH_MAX]; int intercepted = - proc_intercept_readlink(intercept_path, link, sizeof(link)); + proc_intercept_readlink(tx.intercept_path, link, sizeof(link)); if (intercepted >= 0) { size_t copy_len = (size_t) intercepted < bufsiz ? (size_t) intercepted : bufsiz; @@ -973,14 +989,7 @@ int64_t sys_readlinkat(guest_t *g, return -LINUX_EBADF; /* Apply sysroot redirect for absolute paths */ - char sysroot_buf[LINUX_PATH_MAX]; - const char *read_path = path_resolve_sysroot_nofollow_path( - path, sysroot_buf, sizeof(sysroot_buf)); - if (!read_path) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - ssize_t len = readlinkat(dir_ref.fd, read_path, link, sizeof(link) - 1); + ssize_t len = readlinkat(dir_ref.fd, tx.host_path, link, sizeof(link) - 1); host_fd_ref_close(&dir_ref); if (len < 0) return linux_errno(); @@ -995,20 +1004,22 @@ int64_t sys_readlinkat(guest_t *g, int64_t sys_unlinkat(guest_t *g, int dirfd, uint64_t path_gva, int flags) { char path[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - const char *unlink_guest_path = path; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; - int proc_resolved = - resolve_proc_at_path(dirfd, path, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - unlink_guest_path = proc_path; if (!validate_at_flags(flags, LINUX_AT_REMOVEDIR)) 
return -LINUX_EINVAL; + int64_t sidecar_rc = sidecar_unlinkat(dirfd, path, flags); + if (sidecar_rc != SIDECAR_NOT_HANDLED) + return sidecar_rc; + + path_translation_t tx; + int64_t rc = + read_translated_path(g, dirfd, path_gva, PATH_TR_CREATE, path, &tx); + if (rc < 0) + return rc; + host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; @@ -1016,8 +1027,8 @@ int64_t sys_unlinkat(guest_t *g, int dirfd, uint64_t path_gva, int flags) /* Rewrite /dev/shm/ to the host temp directory so shm_unlink works */ const char *unlink_path; char shm_host[LINUX_PATH_MAX]; - if (!strncmp(unlink_guest_path, "/dev/shm/", 9)) { - if (proc_dev_shm_resolve(unlink_guest_path + 9, shm_host, + if (!strncmp(tx.guest_path, "/dev/shm/", 9)) { + if (proc_dev_shm_resolve(tx.guest_path + 9, shm_host, sizeof(shm_host)) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); @@ -1027,14 +1038,7 @@ int64_t sys_unlinkat(guest_t *g, int dirfd, uint64_t path_gva, int flags) dir_ref.fd = AT_FDCWD; dir_ref.owned = 0; } else { - char sysroot_buf[LINUX_PATH_MAX]; - const char *resolved = path_resolve_sysroot_create_path( - unlink_guest_path, sysroot_buf, sizeof(sysroot_buf)); - if (!resolved) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - unlink_path = resolved; + unlink_path = tx.host_path; } int host_flags = translate_at_flags(flags); @@ -1050,30 +1054,24 @@ int64_t sys_unlinkat(guest_t *g, int dirfd, uint64_t path_gva, int flags) int64_t sys_mkdirat(guest_t *g, int dirfd, uint64_t path_gva, int mode) { char path[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - const char *mkdir_path = path; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; - int proc_resolved = - resolve_proc_at_path(dirfd, path, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - mkdir_path = proc_path; + + int64_t sidecar_rc = sidecar_mkdirat(dirfd, path, (mode_t) mode); + if 
(sidecar_rc != SIDECAR_NOT_HANDLED) + return sidecar_rc; + + path_translation_t tx; + int64_t rc = read_translated_path( + g, dirfd, path_gva, PATH_TR_CREATE | PATH_TR_CREATE_PARENTS, path, &tx); + if (rc < 0) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; - char sysroot_buf[LINUX_PATH_MAX]; - const char *resolved = path_resolve_sysroot_create_path( - mkdir_path, sysroot_buf, sizeof(sysroot_buf)); - if (!resolved) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - - if (mkdirat(dir_ref.fd, resolved, (mode_t) mode) < 0) { + if (mkdirat(dir_ref.fd, tx.host_path, (mode_t) mode) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1103,25 +1101,24 @@ int64_t sys_renameat2(guest_t *g, int flags) { char oldpath[LINUX_PATH_MAX], newpath[LINUX_PATH_MAX]; - char old_proc_path[LINUX_PATH_MAX], new_proc_path[LINUX_PATH_MAX]; - const char *old_guest_path = oldpath; - const char *new_guest_path = newpath; - if (guest_read_str(g, oldpath_gva, oldpath, sizeof(oldpath)) < 0) - return -LINUX_EFAULT; - if (guest_read_str(g, newpath_gva, newpath, sizeof(newpath)) < 0) + path_translation_t old_tx, new_tx; + if (guest_read_str(g, oldpath_gva, oldpath, sizeof(oldpath)) < 0 || + guest_read_str(g, newpath_gva, newpath, sizeof(newpath)) < 0) return -LINUX_EFAULT; - int old_proc_resolved = resolve_proc_at_path( - olddirfd, oldpath, old_proc_path, sizeof(old_proc_path)); - if (old_proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (old_proc_resolved > 0) - old_guest_path = old_proc_path; - int new_proc_resolved = resolve_proc_at_path( - newdirfd, newpath, new_proc_path, sizeof(new_proc_path)); - if (new_proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (new_proc_resolved > 0) - new_guest_path = new_proc_path; + + if ((flags & ~(LINUX_RENAME_NOREPLACE | LINUX_RENAME_EXCHANGE)) || + ((flags & LINUX_RENAME_NOREPLACE) && (flags & LINUX_RENAME_EXCHANGE))) { + return -LINUX_EINVAL; + } + + int64_t 
sidecar_rc = + sidecar_renameat(olddirfd, oldpath, newdirfd, newpath, flags); + if (sidecar_rc != SIDECAR_NOT_HANDLED) + return sidecar_rc; + + if (path_translate_at(olddirfd, oldpath, PATH_TR_NONE, &old_tx) < 0 || + path_translate_at(newdirfd, newpath, PATH_TR_CREATE, &new_tx) < 0) + return linux_errno(); host_fd_ref_t olddir_ref, newdir_ref; if (host_dirfd_ref_open(olddirfd, &olddir_ref) < 0) @@ -1131,29 +1128,15 @@ int64_t sys_renameat2(guest_t *g, return -LINUX_EBADF; } - if ((flags & ~(LINUX_RENAME_NOREPLACE | LINUX_RENAME_EXCHANGE)) || - ((flags & LINUX_RENAME_NOREPLACE) && (flags & LINUX_RENAME_EXCHANGE))) { - return close_dir_refs_result(&olddir_ref, &newdir_ref, -LINUX_EINVAL); - } - /* Apply sysroot resolution for absolute paths */ - char old_sysroot[LINUX_PATH_MAX], new_sysroot[LINUX_PATH_MAX]; - const char *old_resolved = path_resolve_sysroot_path( - old_guest_path, old_sysroot, sizeof(old_sysroot)); - const char *new_resolved = path_resolve_sysroot_create_path( - new_guest_path, new_sysroot, sizeof(new_sysroot)); - if (!old_resolved || !new_resolved) { - return close_dir_refs_result(&olddir_ref, &newdir_ref, - -LINUX_ENAMETOOLONG); - } - /* RENAME_NOREPLACE: fail if destination exists. macOS renamex_np * supports RENAME_EXCL for the same semantics. Only supported for * AT_FDCWD paths (renamex_np does not take dirfd arguments). */ if (flags & LINUX_RENAME_NOREPLACE) { if (olddirfd == LINUX_AT_FDCWD && newdirfd == LINUX_AT_FDCWD) { - if (renamex_np(old_resolved, new_resolved, RENAME_EXCL) < 0) { + if (renamex_np(old_tx.host_path, new_tx.host_path, RENAME_EXCL) < + 0) { return close_dir_refs_result(&olddir_ref, &newdir_ref, linux_errno()); } @@ -1164,14 +1147,14 @@ int64_t sys_renameat2(guest_t *g, * requirement. This path still cannot handle directories because * hardlinking directories is not allowed. 
*/ - if (linkat(olddir_ref.fd, old_resolved, newdir_ref.fd, new_resolved, - 0) < 0) { + if (linkat(olddir_ref.fd, old_tx.host_path, newdir_ref.fd, + new_tx.host_path, 0) < 0) { return close_dir_refs_result(&olddir_ref, &newdir_ref, linux_errno()); } - if (unlinkat(olddir_ref.fd, old_resolved, 0) < 0) { + if (unlinkat(olddir_ref.fd, old_tx.host_path, 0) < 0) { int err = errno; - (void) unlinkat(newdir_ref.fd, new_resolved, 0); + (void) unlinkat(newdir_ref.fd, new_tx.host_path, 0); errno = err; return close_dir_refs_result(&olddir_ref, &newdir_ref, linux_errno()); @@ -1183,7 +1166,8 @@ int64_t sys_renameat2(guest_t *g, */ if (flags & LINUX_RENAME_EXCHANGE) { if (olddirfd == LINUX_AT_FDCWD && newdirfd == LINUX_AT_FDCWD) { - if (renamex_np(old_resolved, new_resolved, RENAME_SWAP) < 0) { + if (renamex_np(old_tx.host_path, new_tx.host_path, RENAME_SWAP) < + 0) { return close_dir_refs_result(&olddir_ref, &newdir_ref, linux_errno()); } @@ -1195,15 +1179,15 @@ int64_t sys_renameat2(guest_t *g, } if (olddirfd == LINUX_AT_FDCWD && newdirfd == LINUX_AT_FDCWD) { - if (rename(old_resolved, new_resolved) < 0) { + if (rename(old_tx.host_path, new_tx.host_path) < 0) { return close_dir_refs_result(&olddir_ref, &newdir_ref, linux_errno()); } return close_dir_refs_result(&olddir_ref, &newdir_ref, 0); } - if (renameat(olddir_ref.fd, old_resolved, newdir_ref.fd, new_resolved) < - 0) { + if (renameat(olddir_ref.fd, old_tx.host_path, newdir_ref.fd, + new_tx.host_path) < 0) { return close_dir_refs_result(&olddir_ref, &newdir_ref, linux_errno()); } return close_dir_refs_result(&olddir_ref, &newdir_ref, 0); @@ -1213,32 +1197,19 @@ int64_t sys_mknodat(guest_t *g, int dirfd, uint64_t path_gva, int mode, int dev) { (void) dev; char path[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - const char *node_path = path; - if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) - return -LINUX_EFAULT; - int proc_resolved = - resolve_proc_at_path(dirfd, path, proc_path, sizeof(proc_path)); - if 
(proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - node_path = proc_path; + path_translation_t tx; + int64_t rc = read_translated_path( + g, dirfd, path_gva, PATH_TR_CREATE | PATH_TR_CREATE_PARENTS, path, &tx); + if (rc < 0) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; - char sysroot_buf[LINUX_PATH_MAX]; - const char *target = path_resolve_sysroot_create_path( - node_path, sysroot_buf, sizeof(sysroot_buf)); - if (!target) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - /* Only support FIFO creation; other node types need root */ if (S_ISFIFO(mode)) { - if (mkfifoat(dir_ref.fd, target, mode & 0777) < 0) { + if (mkfifoat(dir_ref.fd, tx.host_path, mode & 0777) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1248,7 +1219,7 @@ int64_t sys_mknodat(guest_t *g, int dirfd, uint64_t path_gva, int mode, int dev) /* Regular files: create an empty file */ if (S_ISREG(mode) || (mode & S_IFMT) == 0) { - int fd = openat(dir_ref.fd, target, O_CREAT | O_WRONLY | O_EXCL, + int fd = openat(dir_ref.fd, tx.host_path, O_CREAT | O_WRONLY | O_EXCL, mode & 0777); host_fd_ref_close(&dir_ref); if (fd < 0) @@ -1267,33 +1238,21 @@ int64_t sys_symlinkat(guest_t *g, uint64_t linkpath_gva) { char target[LINUX_PATH_MAX], linkpath[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - const char *link_guest_path = linkpath; if (guest_read_str(g, target_gva, target, sizeof(target)) < 0) return -LINUX_EFAULT; - if (guest_read_str(g, linkpath_gva, linkpath, sizeof(linkpath)) < 0) - return -LINUX_EFAULT; - int proc_resolved = - resolve_proc_at_path(dirfd, linkpath, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - link_guest_path = proc_path; + path_translation_t tx; + int64_t rc = read_translated_path(g, dirfd, linkpath_gva, + PATH_TR_CREATE | PATH_TR_CREATE_PARENTS, + linkpath, &tx); + if (rc < 0) + return rc; 
host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; /* Resolve linkpath (the new symlink location) through sysroot */ - char sysroot_buf[LINUX_PATH_MAX]; - const char *resolved = path_resolve_sysroot_create_path( - link_guest_path, sysroot_buf, sizeof(sysroot_buf)); - if (!resolved) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - - if (symlinkat(target, dir_ref.fd, resolved) < 0) { + if (symlinkat(target, dir_ref.fd, tx.host_path) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1310,29 +1269,23 @@ int64_t sys_linkat(guest_t *g, int flags) { char oldpath[LINUX_PATH_MAX], newpath[LINUX_PATH_MAX]; - char old_proc_path[LINUX_PATH_MAX], new_proc_path[LINUX_PATH_MAX]; - const char *old_guest_path = oldpath; - const char *new_guest_path = newpath; - if (guest_read_str(g, oldpath_gva, oldpath, sizeof(oldpath)) < 0) - return -LINUX_EFAULT; - if (guest_read_str(g, newpath_gva, newpath, sizeof(newpath)) < 0) + path_translation_t old_tx, new_tx; + if (guest_read_str(g, oldpath_gva, oldpath, sizeof(oldpath)) < 0 || + guest_read_str(g, newpath_gva, newpath, sizeof(newpath)) < 0) return -LINUX_EFAULT; - int old_proc_resolved = resolve_proc_at_path( - olddirfd, oldpath, old_proc_path, sizeof(old_proc_path)); - if (old_proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (old_proc_resolved > 0) - old_guest_path = old_proc_path; - int new_proc_resolved = resolve_proc_at_path( - newdirfd, newpath, new_proc_path, sizeof(new_proc_path)); - if (new_proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (new_proc_resolved > 0) - new_guest_path = new_proc_path; if (!validate_at_flags(flags, LINUX_AT_SYMLINK_FOLLOW)) return -LINUX_EINVAL; + int64_t sidecar_rc = + sidecar_linkat(olddirfd, oldpath, newdirfd, newpath, flags); + if (sidecar_rc != SIDECAR_NOT_HANDLED) + return sidecar_rc; + + if (path_translate_at(olddirfd, oldpath, PATH_TR_NONE, &old_tx) < 0 || + path_translate_at(newdirfd, newpath, PATH_TR_CREATE, 
&new_tx) < 0) + return linux_errno(); + host_fd_ref_t olddir_ref, newdir_ref; if (host_dirfd_ref_open(olddirfd, &olddir_ref) < 0) return -LINUX_EBADF; @@ -1342,19 +1295,8 @@ int64_t sys_linkat(guest_t *g, } /* Resolve both paths through sysroot */ - char old_sr[LINUX_PATH_MAX], new_sr[LINUX_PATH_MAX]; - const char *old_resolved = - path_resolve_sysroot_path(old_guest_path, old_sr, sizeof(old_sr)); - const char *new_resolved = path_resolve_sysroot_create_path( - new_guest_path, new_sr, sizeof(new_sr)); - if (!old_resolved || !new_resolved) { - host_fd_ref_close(&olddir_ref); - host_fd_ref_close(&newdir_ref); - return -LINUX_ENAMETOOLONG; - } - int mac_flags = translate_at_flags(flags); - if (linkat(olddir_ref.fd, old_resolved, newdir_ref.fd, new_resolved, + if (linkat(olddir_ref.fd, old_tx.host_path, newdir_ref.fd, new_tx.host_path, mac_flags) < 0) { host_fd_ref_close(&olddir_ref); host_fd_ref_close(&newdir_ref); @@ -1384,21 +1326,18 @@ int64_t sys_faccessat(guest_t *g, } char path[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - const char *access_path = path; - if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) - return -LINUX_EFAULT; - int proc_resolved = - resolve_proc_at_path(dirfd, path, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - access_path = proc_path; + path_translation_t tx; + int64_t rc = read_translated_path( + g, dirfd, path_gva, + (flags & LINUX_AT_SYMLINK_NOFOLLOW) ? 
PATH_TR_NOFOLLOW : PATH_TR_NONE, + path, &tx); + if (rc < 0) + return rc; if (!validate_at_flags(flags, LINUX_AT_EACCESS | LINUX_AT_SYMLINK_NOFOLLOW)) return -LINUX_EINVAL; - if (proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && path[0] != '/') { + if (tx.proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && path[0] != '/') { int mac_flags = translate_faccessat_flags(flags); if (faccessat(AT_FDCWD, path, mode, mac_flags) < 0) return linux_errno(); @@ -1414,28 +1353,16 @@ int64_t sys_faccessat(guest_t *g, * mode bits, not just path existence. */ struct stat intercepted_st; - if (path_might_use_stat_intercept(access_path) && - proc_intercept_stat(access_path, &intercepted_st) == 0) { + if (path_might_use_stat_intercept(tx.intercept_path) && + proc_intercept_stat(tx.intercept_path, &intercepted_st) == 0) { host_fd_ref_close(&dir_ref); if (path_check_intercept_access(&intercepted_st, mode, flags) < 0) return linux_errno(); return 0; } - char sysroot_buf[LINUX_PATH_MAX]; - const char *check_path = - (flags & LINUX_AT_SYMLINK_NOFOLLOW) - ? 
path_resolve_sysroot_nofollow_path(access_path, sysroot_buf, - sizeof(sysroot_buf)) - : path_resolve_sysroot_path(access_path, sysroot_buf, - sizeof(sysroot_buf)); - if (!check_path) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - int mac_flags = translate_faccessat_flags(flags); - if (faccessat(dir_ref.fd, check_path, mode, mac_flags) < 0) { + if (faccessat(dir_ref.fd, tx.host_path, mode, mac_flags) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1489,24 +1416,13 @@ int64_t sys_ftruncate(int fd, int64_t length) int64_t sys_truncate(guest_t *g, uint64_t path_gva, int64_t length) { char path[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - const char *trunc_guest_path = path; - if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) - return -LINUX_EFAULT; - int proc_resolved = resolve_proc_at_path(LINUX_AT_FDCWD, path, proc_path, - sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - trunc_guest_path = proc_path; - - char sysroot_buf[LINUX_PATH_MAX]; - const char *trunc_path = path_resolve_sysroot_path( - trunc_guest_path, sysroot_buf, sizeof(sysroot_buf)); - if (!trunc_path) - return -LINUX_ENAMETOOLONG; - - if (truncate(trunc_path, length) < 0) + path_translation_t tx; + int64_t rc = read_translated_path(g, LINUX_AT_FDCWD, path_gva, PATH_TR_NONE, + path, &tx); + if (rc < 0) + return rc; + + if (truncate(tx.host_path, length) < 0) return linux_errno(); return 0; } @@ -1537,38 +1453,22 @@ int64_t sys_fchmodat(guest_t *g, int flags) { char path[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - const char *chmod_path = path; - if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) - return -LINUX_EFAULT; - int proc_resolved = - resolve_proc_at_path(dirfd, path, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - chmod_path = proc_path; - if (!validate_at_flags(flags, LINUX_AT_SYMLINK_NOFOLLOW)) return 
-LINUX_EINVAL; + path_translation_t tx; + int64_t rc = read_translated_path( + g, dirfd, path_gva, + (flags & LINUX_AT_SYMLINK_NOFOLLOW) ? PATH_TR_NOFOLLOW : PATH_TR_NONE, + path, &tx); + if (rc < 0) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; - char sysroot_buf[LINUX_PATH_MAX]; - const char *target = - (flags & LINUX_AT_SYMLINK_NOFOLLOW) - ? path_resolve_sysroot_nofollow_path(chmod_path, sysroot_buf, - sizeof(sysroot_buf)) - : path_resolve_sysroot_path(chmod_path, sysroot_buf, - sizeof(sysroot_buf)); - if (!target) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - int mac_flags = translate_at_flags(flags); - if (fchmodat(dir_ref.fd, target, mode, mac_flags) < 0) { + if (fchmodat(dir_ref.fd, tx.host_path, mode, mac_flags) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1585,38 +1485,22 @@ int64_t sys_fchownat(guest_t *g, int flags) { char path[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - const char *chown_path = path; - if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) - return -LINUX_EFAULT; - int proc_resolved = - resolve_proc_at_path(dirfd, path, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) - return -LINUX_ENAMETOOLONG; - if (proc_resolved > 0) - chown_path = proc_path; - if (!validate_at_flags(flags, LINUX_AT_SYMLINK_NOFOLLOW)) return -LINUX_EINVAL; + path_translation_t tx; + int64_t rc = read_translated_path( + g, dirfd, path_gva, + (flags & LINUX_AT_SYMLINK_NOFOLLOW) ? PATH_TR_NOFOLLOW : PATH_TR_NONE, + path, &tx); + if (rc < 0) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; - char sysroot_buf[LINUX_PATH_MAX]; - const char *target = - (flags & LINUX_AT_SYMLINK_NOFOLLOW) - ? 
path_resolve_sysroot_nofollow_path(chown_path, sysroot_buf, - sizeof(sysroot_buf)) - : path_resolve_sysroot_path(chown_path, sysroot_buf, - sizeof(sysroot_buf)); - if (!target) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - int mac_flags = translate_at_flags(flags); - if (fchownat(dir_ref.fd, target, owner, group, mac_flags) < 0) { + if (fchownat(dir_ref.fd, tx.host_path, owner, group, mac_flags) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1659,29 +1543,18 @@ int64_t sys_utimensat(guest_t *g, /* If path is NULL (path_gva == 0), operate on the dirfd itself */ const char *path_arg = NULL; char path[LINUX_PATH_MAX]; - char proc_path[LINUX_PATH_MAX]; - char sysroot_buf[LINUX_PATH_MAX]; + path_translation_t tx; if (path_gva != 0) { - if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) { - host_fd_ref_close(&dir_ref); - return -LINUX_EFAULT; - } - int proc_resolved = - resolve_proc_at_path(dirfd, path, proc_path, sizeof(proc_path)); - if (proc_resolved < 0) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } - const char *src = proc_resolved > 0 ? proc_path : path; - path_arg = (flags & LINUX_AT_SYMLINK_NOFOLLOW) - ? path_resolve_sysroot_nofollow_path(src, sysroot_buf, - sizeof(sysroot_buf)) - : path_resolve_sysroot_path(src, sysroot_buf, - sizeof(sysroot_buf)); - if (!path_arg) { + int64_t rc = read_translated_path(g, dirfd, path_gva, + (flags & LINUX_AT_SYMLINK_NOFOLLOW) + ? 
PATH_TR_NOFOLLOW + : PATH_TR_NONE, + path, &tx); + if (rc < 0) { host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; + return rc; } + path_arg = tx.host_path; } struct timespec ts[2]; diff --git a/src/syscall/path.c b/src/syscall/path.c index dfed81d..971accf 100644 --- a/src/syscall/path.c +++ b/src/syscall/path.c @@ -19,6 +19,7 @@ #include "syscall/abi.h" #include "syscall/path.h" #include "syscall/proc.h" +#include "syscall/sidecar.h" #ifndef MAXSYMLINKS #define MAXSYMLINKS 40 @@ -123,6 +124,108 @@ int path_check_intercept_access(const struct stat *st, int mode, int flags) return -1; } +int path_translate_at(guest_fd_t dirfd, + const char *path, + unsigned int flags, + path_translation_t *tx) +{ + if (!tx) { + errno = EINVAL; + return -1; + } + + memset(tx, 0, sizeof(*tx)); + tx->guest_path = path; + tx->intercept_path = path; + tx->host_path = path; + + if (!path) + return 0; + + tx->proc_resolved = + resolve_proc_at_path(dirfd, path, tx->proc_path, sizeof(tx->proc_path)); + if (tx->proc_resolved < 0) + return -1; + if (tx->proc_resolved > 0) { + tx->guest_path = tx->proc_path; + tx->intercept_path = tx->proc_path; + } + + errno = 0; + if ((flags & PATH_TR_CREATE) && sidecar_active() && + sidecar_path_targets_reserved_name(tx->guest_path)) { + errno = ENOENT; + return -1; + } + + if (flags & PATH_TR_CREATE) { + tx->host_path = path_resolve_sysroot_create_path( + tx->guest_path, tx->host_buf, sizeof(tx->host_buf), + (flags & PATH_TR_CREATE_PARENTS) != 0); + } else if (flags & PATH_TR_NOFOLLOW) { + tx->host_path = path_resolve_sysroot_nofollow_path( + tx->guest_path, tx->host_buf, sizeof(tx->host_buf)); + } else { + tx->host_path = path_resolve_sysroot_path(tx->guest_path, tx->host_buf, + sizeof(tx->host_buf)); + } + + /* Sidecar only runs after sysroot resolution succeeds. If the resolver + * rejected the path (e.g. nofollow containment violation), sidecar must not + * be allowed to walk an alternate index and resurrect the rejected target. 
+ */ + if (tx->host_path && !(flags & PATH_TR_CREATE)) { + int sidecar_rc = sidecar_translate_lookup_at( + dirfd, tx->guest_path, tx->host_buf, sizeof(tx->host_buf)); + if (sidecar_rc < 0) + return -1; + if (sidecar_rc > 0) + tx->host_path = tx->host_buf; + } + + if (!tx->host_path) { + /* Resolvers set errno on every failure path; only synthesize one if a + * future caller forgets, so the error class survives instead of being + * flattened to ENAMETOOLONG. + */ + if (errno == 0) + errno = ENAMETOOLONG; + return -1; + } + + return 0; +} + +int path_translate_dirent_name(guest_fd_t dirfd, + const char *host_name, + char *guest_name, + size_t guest_name_sz) +{ + if (!host_name || !guest_name || guest_name_sz == 0) { + errno = EINVAL; + return -1; + } + + guest_name[0] = '\0'; + int sidecar_rc = sidecar_translate_dirent_name(dirfd, host_name, guest_name, + guest_name_sz); + if (sidecar_rc < 0) + return sidecar_rc; + if (sidecar_rc > 0) + return sidecar_rc; + if (guest_name[0] != '\0') + return 0; + + size_t len = strlen(host_name); + if (len + 1 > guest_name_sz) { + errno = ENAMETOOLONG; + return -1; + } + + memcpy(guest_name, host_name, len + 1); + return 0; +} + static bool path_next_component(const char **pathp, const char **comp, size_t *len) @@ -163,9 +266,10 @@ const char *path_resolve_sysroot_nofollow_path(const char *path, const char *path_resolve_sysroot_create_path(const char *path, char *buf, - size_t bufsz) + size_t bufsz, + bool create_parents) { - return proc_resolve_sysroot_create_path(path, buf, bufsz); + return proc_resolve_sysroot_create_path(path, buf, bufsz, create_parents); } int sys_path_has_symlink(guest_fd_t dirfd, const char *path) diff --git a/src/syscall/path.h b/src/syscall/path.h index 29e1715..0b95f23 100644 --- a/src/syscall/path.h +++ b/src/syscall/path.h @@ -13,9 +13,33 @@ #include "syscall/internal.h" +typedef enum { + PATH_TR_NONE = 0, + PATH_TR_NOFOLLOW = 1u << 0, + PATH_TR_CREATE = 1u << 1, + PATH_TR_CREATE_PARENTS = 1u << 2, +} 
path_translate_flags_t; + +typedef struct { + const char *guest_path; + const char *intercept_path; + const char *host_path; + int proc_resolved; + char proc_path[LINUX_PATH_MAX]; + char host_buf[LINUX_PATH_MAX]; +} path_translation_t; + bool path_might_use_open_intercept(const char *path); bool path_might_use_stat_intercept(const char *path); int path_check_intercept_access(const struct stat *st, int mode, int flags); +int path_translate_at(guest_fd_t dirfd, + const char *path, + unsigned int flags, + path_translation_t *tx); +int path_translate_dirent_name(guest_fd_t dirfd, + const char *host_name, + char *guest_name, + size_t guest_name_sz); int resolve_proc_at_path(guest_fd_t dirfd, const char *path, char *out, @@ -34,7 +58,8 @@ const char *path_resolve_sysroot_nofollow_path(const char *path, size_t bufsz); const char *path_resolve_sysroot_create_path(const char *path, char *buf, - size_t bufsz); + size_t bufsz, + bool create_parents); int path_openat2_stays_beneath(const char *path, bool clamp_at_root); int path_openat2_normalize_in_root(const char *path, char *out, size_t outsz); diff --git a/src/syscall/proc-state.c b/src/syscall/proc-state.c index 001a8e4..bca978d 100644 --- a/src/syscall/proc-state.c +++ b/src/syscall/proc-state.c @@ -16,6 +16,8 @@ #include "utils.h" +#include "core/sysroot.h" + #include "runtime/thread.h" #include "syscall/internal.h" @@ -30,6 +32,11 @@ static bool shim_blob_owned = false; /* Current ELF and launcher paths. */ static char elf_path[LINUX_PATH_MAX] = {0}; static char elfuse_path[LINUX_PATH_MAX] = {0}; +/* Serializes proc_set_elf_path against readers that consume the string. + * Without this, an execve on one vCPU can tear the buffer underneath a + * sibling vCPU resolving /proc/self/exe. + */ +static pthread_mutex_t elf_path_lock = PTHREAD_MUTEX_INITIALIZER; /* Guest process metadata snapshots. 
*/ static char cmdline_buf[8192] = {0}; @@ -45,6 +52,7 @@ static char sysroot_path[LINUX_PATH_MAX] = {0}; * the snprintf input buffer underneath that thread. */ static pthread_mutex_t sysroot_lock = PTHREAD_MUTEX_INITIALIZER; +static bool sysroot_casefold = false; /* Cached current working directory for getcwd() and /proc/self/cwd. */ static pthread_mutex_t cwd_lock = PTHREAD_MUTEX_INITIALIZER; @@ -62,6 +70,7 @@ void proc_state_init(void) environ_len = 0; auxv_len = 0; sysroot_path[0] = '\0'; + sysroot_casefold = false; pthread_mutex_lock(&cwd_lock); cwd_path[0] = '\0'; @@ -80,7 +89,7 @@ int proc_cwd_refresh(void) char sr[LINUX_PATH_MAX]; if (proc_sysroot_snapshot(sr, sizeof(sr))) { size_t sr_len = strlen(sr); - if (strncmp(cwd, sr, sr_len) == 0 && + if (!strncmp(cwd, sr, sr_len) && (cwd[sr_len] == '\0' || cwd[sr_len] == '/')) { guest_cwd = cwd + sr_len; if (*guest_cwd == '\0') @@ -238,6 +247,7 @@ unsigned int proc_get_shim_size(void) void proc_set_elf_path(const char *path) { + pthread_mutex_lock(&elf_path_lock); if (path) { if (path[0] == '/') { str_copy_trunc(elf_path, path, sizeof(elf_path)); @@ -251,13 +261,38 @@ void proc_set_elf_path(const char *path) } else { elf_path[0] = '\0'; } + pthread_mutex_unlock(&elf_path_lock); } +/* Returns the elf_path buffer pointer. Boolean-test callers tolerate the + * racy read since the first byte transitions atomically. Callers that + * consume the string content must use proc_elf_path_snapshot() to avoid + * a torn read against a concurrent proc_set_elf_path(). + */ const char *proc_get_elf_path(void) { return elf_path[0] ? 
elf_path : NULL; } +bool proc_elf_path_snapshot(char *out, size_t outsz) +{ + if (!out || outsz == 0) + return false; + pthread_mutex_lock(&elf_path_lock); + bool ok = false; + if (elf_path[0]) { + size_t need = strlen(elf_path) + 1; + if (need <= outsz) { + memcpy(out, elf_path, need); + ok = true; + } + } + pthread_mutex_unlock(&elf_path_lock); + if (!ok) + out[0] = '\0'; + return ok; +} + void proc_set_elfuse_path(const char *path) { if (path) @@ -390,6 +425,22 @@ bool proc_sysroot_snapshot(char *out, size_t outsz) return ok; } +void proc_set_sysroot_casefold(bool enabled) +{ + pthread_mutex_lock(&sysroot_lock); + sysroot_casefold = enabled; + pthread_mutex_unlock(&sysroot_lock); +} + +bool proc_sysroot_casefold_enabled(void) +{ + bool enabled; + pthread_mutex_lock(&sysroot_lock); + enabled = sysroot_casefold; + pthread_mutex_unlock(&sysroot_lock); + return enabled; +} + /* Confirm resolved_path canonicalizes inside sysroot. This is a check-then-use * sequence: callers issue the actual syscall after this returns, so a symlink * swapped in between will not be re-validated. openat2 @@ -410,28 +461,40 @@ static bool sysroot_path_is_contained(const char *resolved_path, if (!realpath(resolved_path, real_path)) return false; } else { - char parent[LINUX_PATH_MAX]; - char *slash; - - str_copy_trunc(parent, resolved_path, sizeof(parent)); - slash = strrchr(parent, '/'); - /* resolved_path is always ${sysroot}${guest_path} where sysroot is - * non-empty (caller short-circuits otherwise) and guest_path starts - * with '/'. The result therefore contains at least two '/' bytes, so - * the basename slash is never at parent[0]. Reject anything that - * violates the invariant rather than carrying dead code for it. + const char *base = strrchr(resolved_path, '/'); + /* "." and ".." basenames navigate the directory tree and cannot + * themselves be symlinks, so realpath the full path: appending + * the literal basename onto realpath(parent) would let + * "${sysroot}/x/.." 
pass the prefix check while the kernel + * resolves the syscall target above sysroot. */ - if (!slash || slash == parent) - return false; - - *slash = '\0'; - if (!realpath(parent, real_path)) - return false; - size_t parent_len = strlen(real_path); - if (snprintf(real_path + parent_len, sizeof(real_path) - parent_len, - "/%s", - slash + 1) >= (int) (sizeof(real_path) - parent_len)) { - return false; + if (base && (!strcmp(base + 1, "..") || !strcmp(base + 1, "."))) { + if (!realpath(resolved_path, real_path)) + return false; + } else { + char parent[LINUX_PATH_MAX]; + char *slash; + + str_copy_trunc(parent, resolved_path, sizeof(parent)); + slash = strrchr(parent, '/'); + /* resolved_path is always ${sysroot}${guest_path} where sysroot + * is non-empty (caller short-circuits otherwise) and guest_path + * starts with '/'. The result therefore contains at least two + * '/' bytes, so the basename slash is never at parent[0]. + * Reject anything that violates the invariant rather than + * carrying dead code for it. 
+ */ + if (!slash || slash == parent) + return false; + + *slash = '\0'; + if (!realpath(parent, real_path)) + return false; + size_t parent_len = strlen(real_path); + if (snprintf(real_path + parent_len, sizeof(real_path) - parent_len, + "/%s", + slash + 1) >= (int) (sizeof(real_path) - parent_len)) + return false; } } @@ -467,20 +530,31 @@ static const char *proc_resolve_sysroot_path_flags(const char *path, char sr[LINUX_PATH_MAX]; if (!proc_sysroot_snapshot(sr, sizeof(sr)) || !path || path[0] != '/') return path; - if (bufsz == 0) + if (bufsz == 0) { + errno = ENAMETOOLONG; return NULL; + } int n = snprintf(buf, bufsz, "%s%s", sr, path); - if (n < 0) + if (n < 0) { + if (errno == 0) + errno = EINVAL; return NULL; + } bool full_path_truncated = (size_t) n >= bufsz; if (!full_path_truncated && sysroot_path_exists(buf, follow_final)) { - if (!sysroot_path_is_contained(buf, sr, follow_final)) + if (!sysroot_path_is_contained(buf, sr, follow_final)) { + errno = ELOOP; return NULL; + } return buf; } - return full_path_truncated ? 
NULL : path; + if (full_path_truncated) { + errno = ENAMETOOLONG; + return NULL; + } + return path; } const char *proc_resolve_sysroot_path(const char *path, char *buf, size_t bufsz) @@ -497,28 +571,69 @@ const char *proc_resolve_sysroot_nofollow_path(const char *path, const char *proc_resolve_sysroot_create_path(const char *path, char *buf, - size_t bufsz) + size_t bufsz, + bool create_parents) { char sr[LINUX_PATH_MAX]; if (!proc_sysroot_snapshot(sr, sizeof(sr)) || !path || path[0] != '/') return path; - if (bufsz == 0) + if (bufsz == 0) { + errno = ENAMETOOLONG; return NULL; + } int n = snprintf(buf, bufsz, "%s%s", sr, path); - if (n < 0 || (size_t) n >= bufsz) + if (n < 0) { + if (errno == 0) + errno = EINVAL; return NULL; + } + if ((size_t) n >= bufsz) { + errno = ENAMETOOLONG; + return NULL; + } char parent[LINUX_PATH_MAX]; str_copy_trunc(parent, buf, sizeof(parent)); char *slash = strrchr(parent, '/'); - if (slash && slash != parent) { - *slash = '\0'; - if (access(parent, F_OK) == 0 && - !sysroot_path_is_contained(parent, sr, true)) { + if (!slash || slash == parent) + return buf; + + *slash = '\0'; + if (access(parent, F_OK) == 0) { + if (!sysroot_path_is_contained(parent, sr, true)) { + errno = ELOOP; return NULL; } + return buf; } + /* access() failed for a reason other than "parent missing" (e.g. EACCES, + * ELOOP, ENAMETOOLONG, EIO). Treating those as "parent absent" would let + * the redirect logic auto-create or silently fall back to the host + * literal, which can bypass sysroot resolution. Surface the real error. + */ + if (errno != ENOENT && errno != ENOTDIR) + return NULL; + + /* Parent doesn't exist in sysroot. Only /tmp, /var/tmp, and ccache get + * forcefully redirected to the sysroot to avoid host case-collisions; + * everything else falls back to the host literal. 
+ */ + if (strncmp(path, "/tmp/", 5) && strncmp(path, "/var/tmp/", 9) && + !strstr(path, "/.ccache/")) + return path; + + if (!create_parents) { + if (sysroot_validate_dir_prefix(parent) < 0) + return NULL; + return buf; + } + if (sysroot_ensure_dir_exists(parent) < 0) + return NULL; + if (!sysroot_path_is_contained(parent, sr, true)) { + errno = ELOOP; + return NULL; + } return buf; } diff --git a/src/syscall/proc.h b/src/syscall/proc.h index 6cf2d9a..3b5cc94 100644 --- a/src/syscall/proc.h +++ b/src/syscall/proc.h @@ -62,9 +62,18 @@ unsigned int proc_get_shim_size(void); */ void proc_set_elf_path(const char *path); -/* Get the stored ELF binary path. Returns NULL if not set. */ +/* Get the stored ELF binary path. Returns NULL if not set. The returned + * pointer references shared mutable state and is safe only for boolean + * tests; callers that consume the string must use proc_elf_path_snapshot. + */ const char *proc_get_elf_path(void); +/* Copy the stored ELF binary path into out. Returns true on success, false + * if no path is set or outsz is too small. Locked against concurrent + * proc_set_elf_path() so the returned content is consistent. + */ +bool proc_elf_path_snapshot(char *out, size_t outsz); + /* Store the absolute path of the elfuse binary itself. Used to spawn * fork/clone children. Set once at startup via _NSGetExecutablePath(). */ @@ -204,6 +213,8 @@ const char *proc_get_sysroot(void); * case out[0] is set to '\0' when possible). */ bool proc_sysroot_snapshot(char *out, size_t outsz); +void proc_set_sysroot_casefold(bool enabled); +bool proc_sysroot_casefold_enabled(void); /* Resolve an absolute guest path through the stored sysroot. Returns path * unchanged when no sysroot applies or when the sysroot-backed path does not @@ -228,7 +239,8 @@ const char *proc_resolve_sysroot_nofollow_path(const char *path, */ const char *proc_resolve_sysroot_create_path(const char *path, char *buf, - size_t bufsz); + size_t bufsz, + bool create_parents); /* execve. 
*/ diff --git a/src/syscall/sidecar.c b/src/syscall/sidecar.c new file mode 100644 index 0000000..01fcf93 --- /dev/null +++ b/src/syscall/sidecar.c @@ -0,0 +1,1880 @@ +/* Case-folding fallback VFS helpers + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +#include "syscall/abi.h" +#include "syscall/internal.h" +#include "syscall/path.h" +#include "syscall/proc.h" +#include "syscall/sidecar.h" + +#ifndef LINUX_RENAME_NOREPLACE +#define LINUX_RENAME_NOREPLACE (1 << 0) +#endif +#ifndef LINUX_RENAME_EXCHANGE +#define LINUX_RENAME_EXCHANGE (1 << 1) +#endif + +#define SIDECAR_INDEX_TMP_NAME SIDECAR_INDEX_NAME ".tmp" +#define SIDECAR_INDEX_LOCK_NAME SIDECAR_INDEX_NAME ".lock" + +/* fcntl POSIX advisory locks are per-process. Within a single elfuse instance, + * multiple vCPU threads all "hold" the same fcntl lock simultaneously. This + * mutex serializes index updates across vCPU threads; the fcntl lock on the + * dedicated lock sentinel still serializes against forked elfuse processes + * that share the same sysroot. + */ +static pthread_mutex_t sidecar_global_lock = PTHREAD_MUTEX_INITIALIZER; + +typedef struct { + char *guest_name; + char token[SIDECAR_TOKEN_NAME_LEN + 1]; +} sidecar_row_t; + +typedef struct { + sidecar_row_t *rows; + size_t count; +} sidecar_index_t; + +typedef struct { + host_fd_t dirfd; + bool absolute; + char basename[NAME_MAX + 1]; +} sidecar_parent_t; + +static void sidecar_index_free(sidecar_index_t *index) +{ + if (!index) + return; + for (size_t i = 0; i < index->count; i++) + free(index->rows[i].guest_name); + free(index->rows); + index->rows = NULL; + index->count = 0; +} + +/* Deep-copy src into dst so the caller can mutate dst freely and still + * recover src for rollback. 
Returns 0 on success, -1 with errno on alloc + * failure (dst is left empty in that case). + */ +static int sidecar_index_clone(const sidecar_index_t *src, sidecar_index_t *dst) +{ + dst->rows = NULL; + dst->count = 0; + if (src->count == 0) + return 0; + dst->rows = (sidecar_row_t *) malloc(src->count * sizeof(sidecar_row_t)); + if (!dst->rows) + return -1; + for (size_t i = 0; i < src->count; i++) { + dst->rows[i].guest_name = strdup(src->rows[i].guest_name); + if (!dst->rows[i].guest_name) { + dst->count = i; + sidecar_index_free(dst); + return -1; + } + memcpy(dst->rows[i].token, src->rows[i].token, + sizeof(dst->rows[i].token)); + } + dst->count = src->count; + return 0; +} + +bool sidecar_active(void) +{ + return proc_get_sysroot() && proc_sysroot_casefold_enabled(); +} + +bool sidecar_name_reserved(const char *name) +{ + return name && (!strcmp(name, SIDECAR_INDEX_NAME) || + !strcmp(name, SIDECAR_INDEX_TMP_NAME) || + !strcmp(name, SIDECAR_INDEX_LOCK_NAME)); +} + +bool sidecar_path_targets_reserved_name(const char *path) +{ + if (!path || path[0] == '\0') + return false; + + const char *basename = strrchr(path, '/'); + basename = basename ? 
basename + 1 : path; + return sidecar_name_reserved(basename); +} + +static int hex_nibble(unsigned char c) +{ + if (c >= '0' && c <= '9') + return (int) (c - '0'); + if (c >= 'a' && c <= 'f') + return (int) (c - 'a' + 10); + if (c >= 'A' && c <= 'F') + return (int) (c - 'A' + 10); + return -1; +} + +static int sidecar_decode_name(const char *hex, char **out) +{ + size_t len = strlen(hex); + if ((len & 1u) != 0) { + errno = EPROTO; + return -1; + } + + char *name = (char *) malloc(len / 2 + 1); + if (!name) + return -1; + + for (size_t i = 0; i < len; i += 2) { + int hi = hex_nibble((unsigned char) hex[i]); + int lo = hex_nibble((unsigned char) hex[i + 1]); + if (hi < 0 || lo < 0) { + free(name); + errno = EPROTO; + return -1; + } + name[i / 2] = (char) ((hi << 4) | lo); + } + name[len / 2] = '\0'; + *out = name; + return 0; +} + +static int sidecar_load_index(int dirfd, sidecar_index_t *index) +{ + memset(index, 0, sizeof(*index)); + + int fd = openat(dirfd, SIDECAR_INDEX_NAME, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + if (errno == ENOENT) + return 0; + return -1; + } + + struct stat st; + if (fstat(fd, &st) < 0) { + close(fd); + return -1; + } + if (st.st_size == 0) { + close(fd); + return 0; + } + if (st.st_size < 0 || st.st_size >= (off_t) LINUX_PATH_MAX * 64) { + close(fd); + errno = EFBIG; + return -1; + } + + size_t size = (size_t) st.st_size; + char *buf = (char *) malloc(size + 1); + if (!buf) { + close(fd); + return -1; + } + + size_t off = 0; + while (off < size) { + ssize_t n = read(fd, buf + off, size - off); + if (n < 0) { + if (errno == EINTR) + continue; + free(buf); + close(fd); + return -1; + } + if (n == 0) + break; + off += (size_t) n; + } + close(fd); + buf[off] = '\0'; + + char *line = buf; + while (*line) { + char *newline = strchr(line, '\n'); + if (newline) + *newline = '\0'; + + if (*line != '\0') { + char *tab = strchr(line, '\t'); + if (!tab || tab == line || tab[1] == '\0') { + free(buf); + sidecar_index_free(index); + errno = EPROTO; + 
return -1; + } + *tab = '\0'; + + sidecar_row_t *rows = (sidecar_row_t *) realloc( + index->rows, (index->count + 1) * sizeof(sidecar_row_t)); + if (!rows) { + free(buf); + sidecar_index_free(index); + return -1; + } + index->rows = rows; + if (sidecar_decode_name( + line, &index->rows[index->count].guest_name) < 0) { + free(buf); + sidecar_index_free(index); + return -1; + } + if (strlen(tab + 1) != SIDECAR_TOKEN_NAME_LEN) { + free(buf); + sidecar_index_free(index); + errno = EPROTO; + return -1; + } + memcpy(index->rows[index->count].token, tab + 1, + SIDECAR_TOKEN_NAME_LEN + 1); + index->count++; + } + + if (!newline) + break; + line = newline + 1; + } + + free(buf); + return 0; +} + +static const char *sidecar_lookup_guest(const sidecar_index_t *index, + const char *guest_name) +{ + for (size_t i = 0; i < index->count; i++) { + if (!strcmp(index->rows[i].guest_name, guest_name)) + return index->rows[i].token; + } + return NULL; +} + +static const char *sidecar_lookup_token(const sidecar_index_t *index, + const char *token) +{ + for (size_t i = 0; i < index->count; i++) { + if (!strcmp(index->rows[i].token, token)) + return index->rows[i].guest_name; + } + return NULL; +} + +static int sidecar_next_component(const char **pathp, + const char **comp, + size_t *len) +{ + const char *p = *pathp; + while (*p == '/') + p++; + if (*p == '\0') { + *pathp = p; + return 0; + } + *comp = p; + while (*p != '\0' && *p != '/') + p++; + *len = (size_t) (p - *comp); + *pathp = p; + return 1; +} + +static int sidecar_append_component(char *out, + size_t outsz, + size_t *len_io, + const char *comp, + bool absolute) +{ + size_t len = *len_io; + size_t comp_len = strlen(comp); + + if (absolute) { + if (len == 0 || out[len - 1] != '/') { + if (len + 1 >= outsz) { + errno = ENAMETOOLONG; + return -1; + } + out[len++] = '/'; + } + } else if (len != 0) { + if (len + 1 >= outsz) { + errno = ENAMETOOLONG; + return -1; + } + out[len++] = '/'; + } + + if (len + comp_len >= outsz) { + errno 
= ENAMETOOLONG; + return -1; + } + memcpy(out + len, comp, comp_len); + len += comp_len; + out[len] = '\0'; + *len_io = len; + return 0; +} + +static int sidecar_open_base(guest_fd_t dirfd, + const char *path, + char *out, + size_t outsz, + host_fd_t *base_fd, + bool *absolute) +{ + out[0] = '\0'; + *absolute = false; + + if (path[0] == '/') { + char sysroot[LINUX_PATH_MAX]; + if (!proc_sysroot_snapshot(sysroot, sizeof(sysroot))) { + errno = ENOENT; + return -1; + } + size_t len = str_copy_trunc(out, sysroot, outsz); + if (len >= outsz) { + errno = ENAMETOOLONG; + return -1; + } + *base_fd = open(sysroot, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (*base_fd < 0) + return -1; + *absolute = true; + return 0; + } + + if (dirfd == LINUX_AT_FDCWD) { + *base_fd = open(".", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + return *base_fd < 0 ? -1 : 0; + } + + host_fd_ref_t ref; + if (host_dirfd_ref_open(dirfd, &ref) < 0) { + errno = EBADF; + return -1; + } + *base_fd = dup(ref.fd); + host_fd_ref_close(&ref); + return *base_fd < 0 ? -1 : 0; +} + +int sidecar_translate_lookup_at(guest_fd_t dirfd, + const char *path, + char *out, + size_t outsz) +{ + if (!sidecar_active() || !path) + return 0; + if (path[0] == '\0') + return 0; + + char normalized[LINUX_PATH_MAX]; + const char *scan = path; + if (path[0] == '/') { + if (path_openat2_normalize_in_root(path, normalized, + sizeof(normalized)) < 0) { + errno = ENAMETOOLONG; + return -1; + } + scan = normalized; + + /* Kernel virtual filesystems live in procemu, not on the sysroot + * disk tree. Walking them here would openat() against a directory + * that never exists in the sysroot and short-circuit the procemu + * intercept downstream of path_translate_at(). Punt to that layer + * instead. Check the normalized form so "/./proc/..." and + * "//proc/..." also skip; match only on a full top-level component + * so siblings like "/procfoo" still go through sidecar. 
Note that + * path_openat2_normalize_in_root() strips the leading '/' from + * absolute inputs, so the prefixes here are unrooted. + */ + size_t plen = 0; + if (!strncmp(normalized, "proc", 4)) + plen = 4; + else if (!strncmp(normalized, "sys", 3)) + plen = 3; + else if (!strncmp(normalized, "dev", 3)) + plen = 3; + if (plen && (normalized[plen] == '\0' || normalized[plen] == '/')) + return 0; + } + + host_fd_t cur_fd = -1; + bool absolute = false; + if (sidecar_open_base(dirfd, path, out, outsz, &cur_fd, &absolute) < 0) + return -1; + + size_t out_len = strlen(out); + const char *comp; + size_t comp_len; + while (sidecar_next_component(&scan, &comp, &comp_len)) { + char guest_comp[NAME_MAX + 1]; + if (comp_len >= sizeof(guest_comp)) { + close(cur_fd); + errno = ENAMETOOLONG; + return -1; + } + memcpy(guest_comp, comp, comp_len); + guest_comp[comp_len] = '\0'; + + if (sidecar_name_reserved(guest_comp)) { + close(cur_fd); + errno = ENOENT; + return -1; + } + if (!strcmp(guest_comp, ".") || !strcmp(guest_comp, "..")) { + if (sidecar_append_component(out, outsz, &out_len, guest_comp, + absolute) < 0) { + close(cur_fd); + return -1; + } + if (strcmp(guest_comp, ".")) { + int next_fd = openat(cur_fd, guest_comp, + O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (next_fd < 0) { + close(cur_fd); + return -1; + } + close(cur_fd); + cur_fd = next_fd; + } + continue; + } + + sidecar_index_t index; + if (sidecar_load_index(cur_fd, &index) < 0) { + close(cur_fd); + return -1; + } + const char *mapped = sidecar_lookup_guest(&index, guest_comp); + char host_comp[NAME_MAX + 1]; + if (mapped) + str_copy_trunc(host_comp, mapped, sizeof(host_comp)); + else + str_copy_trunc(host_comp, guest_comp, sizeof(host_comp)); + + if (sidecar_append_component(out, outsz, &out_len, host_comp, + absolute) < 0) { + sidecar_index_free(&index); + close(cur_fd); + return -1; + } + sidecar_index_free(&index); + + const char *peek = scan; + while (*peek == '/') + peek++; + if (*peek == '\0') + break; + + int 
next_fd = + openat(cur_fd, host_comp, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (next_fd < 0) { + close(cur_fd); + return -1; + } + close(cur_fd); + cur_fd = next_fd; + } + + close(cur_fd); + return 1; +} + +int sidecar_translate_dirent_name(guest_fd_t dirfd, + const char *host_name, + char *guest_name, + size_t guest_name_sz) +{ + if (!sidecar_active()) + return 0; + if (sidecar_name_reserved(host_name)) + return 1; + + host_fd_ref_t ref; + if (host_fd_ref_open(dirfd, &ref) < 0) { + errno = EBADF; + return -1; + } + + sidecar_index_t index; + int rc = sidecar_load_index(ref.fd, &index); + host_fd_ref_close(&ref); + if (rc < 0) + return -1; + + const char *guest = sidecar_lookup_token(&index, host_name); + if (!guest) { + sidecar_index_free(&index); + return 0; + } + + size_t len = strlen(guest); + if (len + 1 > guest_name_sz) { + sidecar_index_free(&index); + errno = ENAMETOOLONG; + return -1; + } + memcpy(guest_name, guest, len + 1); + sidecar_index_free(&index); + return 0; +} +static int sidecar_encode_name(const char *name, char **out) +{ + static const char hex[] = "0123456789abcdef"; + size_t len = strlen(name); + char *buf = (char *) malloc(len * 2 + 1); + if (!buf) + return -1; + for (size_t i = 0; i < len; i++) { + unsigned char c = (unsigned char) name[i]; + buf[i * 2] = hex[c >> 4]; + buf[i * 2 + 1] = hex[c & 0x0f]; + } + buf[len * 2] = '\0'; + *out = buf; + return 0; +} + +static int sidecar_exact_name_exists(int dirfd, const char *name) +{ + int dup_fd = dup(dirfd); + if (dup_fd < 0) + return -1; + + DIR *dir = fdopendir(dup_fd); + if (!dir) { + close(dup_fd); + return -1; + } + + int found = 0; + struct dirent *de; + while ((de = readdir(dir)) != NULL) { + if (!strcmp(de->d_name, name)) { + found = 1; + break; + } + } + closedir(dir); + return found; +} + +static ssize_t sidecar_find_guest_index(const sidecar_index_t *index, + const char *guest_name) +{ + for (size_t i = 0; i < index->count; i++) { + if (!strcmp(index->rows[i].guest_name, 
guest_name)) + return (ssize_t) i; + } + return -1; +} + +/* fcntl-only lock acquisition. Caller must hold sidecar_global_lock so that + * the lock_two_indices nested path does not need a recursive mutex. + */ +static int sidecar_lock_index_fcntl(int dirfd, int *lock_fd) +{ + *lock_fd = openat(dirfd, SIDECAR_INDEX_LOCK_NAME, + O_RDWR | O_CREAT | O_CLOEXEC, 0644); + if (*lock_fd < 0) + return -1; + + struct flock fl = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + while (fcntl(*lock_fd, F_SETLKW, &fl) < 0) { + if (errno != EINTR) { + int saved_errno = errno; + close(*lock_fd); + *lock_fd = -1; + errno = saved_errno; + return -1; + } + } + return 0; +} + +static void sidecar_unlock_index_fcntl(int lock_fd) +{ + if (lock_fd < 0) + return; + struct flock fl = { + .l_type = F_UNLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + (void) fcntl(lock_fd, F_SETLK, &fl); + close(lock_fd); +} + +static int sidecar_lock_index(int dirfd, int *lock_fd) +{ + pthread_mutex_lock(&sidecar_global_lock); + if (sidecar_lock_index_fcntl(dirfd, lock_fd) < 0) { + int saved_errno = errno; + pthread_mutex_unlock(&sidecar_global_lock); + errno = saved_errno; + return -1; + } + return 0; +} + +static void sidecar_unlock_index(int lock_fd) +{ + sidecar_unlock_index_fcntl(lock_fd); + pthread_mutex_unlock(&sidecar_global_lock); +} + +static int sidecar_load_locked_index(int parent_dirfd, + int lock_fd, + sidecar_index_t *index) +{ + (void) lock_fd; + memset(index, 0, sizeof(*index)); + + int fd = openat(parent_dirfd, SIDECAR_INDEX_NAME, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + if (errno == ENOENT) + return 0; + return -1; + } + + struct stat st; + if (fstat(fd, &st) < 0) { + close(fd); + return -1; + } + if (st.st_size == 0) { + close(fd); + return 0; + } + if (st.st_size < 0 || st.st_size >= (off_t) LINUX_PATH_MAX * 64) { + close(fd); + errno = EFBIG; + return -1; + } + + size_t size = (size_t) st.st_size; + char *buf = (char *) malloc(size + 
1); + if (!buf) { + close(fd); + return -1; + } + if (lseek(fd, 0, SEEK_SET) < 0) { + int saved_errno = errno; + close(fd); + free(buf); + errno = saved_errno; + return -1; + } + + size_t off = 0; + while (off < size) { + ssize_t n = read(fd, buf + off, size - off); + if (n < 0) { + if (errno == EINTR) + continue; + int saved_errno = errno; + close(fd); + free(buf); + errno = saved_errno; + return -1; + } + if (n == 0) + break; + off += (size_t) n; + } + buf[off] = '\0'; + + char *line = buf; + while (*line) { + char *newline = strchr(line, '\n'); + if (newline) + *newline = '\0'; + if (*line != '\0') { + char *tab = strchr(line, '\t'); + if (!tab || tab == line || tab[1] == '\0') { + close(fd); + free(buf); + sidecar_index_free(index); + errno = EPROTO; + return -1; + } + *tab = '\0'; + sidecar_row_t *rows = (sidecar_row_t *) realloc( + index->rows, (index->count + 1) * sizeof(sidecar_row_t)); + if (!rows) { + int saved_errno = errno; + close(fd); + free(buf); + sidecar_index_free(index); + errno = saved_errno; + return -1; + } + index->rows = rows; + if (sidecar_decode_name( + line, &index->rows[index->count].guest_name) < 0) { + int saved_errno = errno; + close(fd); + free(buf); + sidecar_index_free(index); + errno = saved_errno; + return -1; + } + if (strlen(tab + 1) != SIDECAR_TOKEN_NAME_LEN) { + close(fd); + free(buf); + sidecar_index_free(index); + errno = EPROTO; + return -1; + } + memcpy(index->rows[index->count].token, tab + 1, + SIDECAR_TOKEN_NAME_LEN + 1); + index->count++; + } + if (!newline) + break; + line = newline + 1; + } + + if (close(fd) < 0) { + int saved_errno = errno; + free(buf); + sidecar_index_free(index); + errno = saved_errno; + return -1; + } + free(buf); + return 0; +} + +/* Write all bytes or fail. Returns 0 on success, -1 with errno set on error. + * Handles short writes by retrying until everything is committed. 
/* Push exactly len bytes from buf to fd, looping over partial writes.
 *
 * A write interrupted by a signal (EINTR) is retried. A write that reports
 * zero bytes is treated as an I/O error (errno = EIO) so the loop cannot
 * spin forever.
 *
 * Returns 0 once everything is written, -1 with errno set otherwise.
 */
static int sidecar_write_all(int fd, const char *buf, size_t len)
{
    const char *cursor = buf;
    size_t remaining = len;

    while (remaining != 0) {
        ssize_t written = write(fd, cursor, remaining);
        if (written > 0) {
            cursor += written;
            remaining -= (size_t) written;
            continue;
        }
        if (written == 0) {
            errno = EIO;
            return -1;
        }
        if (errno != EINTR)
            return -1;
    }
    return 0;
}
+ */ +static int sidecar_write_locked_index(int parent_dirfd, + int lock_fd, + const sidecar_index_t *index) +{ + (void) lock_fd; + + char *payload = NULL; + size_t payload_len = 0; + if (sidecar_serialize_index(index, &payload, &payload_len) < 0) + return -1; + + int tmp_fd = openat(parent_dirfd, SIDECAR_INDEX_TMP_NAME, + O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644); + if (tmp_fd < 0) { + int saved_errno = errno; + free(payload); + errno = saved_errno; + return -1; + } + if (payload_len > 0 && + sidecar_write_all(tmp_fd, payload, payload_len) < 0) { + int saved_errno = errno; + close(tmp_fd); + (void) unlinkat(parent_dirfd, SIDECAR_INDEX_TMP_NAME, 0); + free(payload); + errno = saved_errno; + return -1; + } + if (close(tmp_fd) < 0) { + int saved_errno = errno; + (void) unlinkat(parent_dirfd, SIDECAR_INDEX_TMP_NAME, 0); + free(payload); + errno = saved_errno; + return -1; + } + if (renameat(parent_dirfd, SIDECAR_INDEX_TMP_NAME, parent_dirfd, + SIDECAR_INDEX_NAME) < 0) { + int saved_errno = errno; + (void) unlinkat(parent_dirfd, SIDECAR_INDEX_TMP_NAME, 0); + free(payload); + errno = saved_errno; + return -1; + } + free(payload); + return 0; +} + +static int sidecar_remove_guest_locked(sidecar_index_t *index, + size_t remove_idx) +{ + if (remove_idx >= index->count) { + errno = ENOENT; + return -1; + } + + free(index->rows[remove_idx].guest_name); + if (remove_idx + 1 < index->count) { + memmove(&index->rows[remove_idx], &index->rows[remove_idx + 1], + (index->count - remove_idx - 1) * sizeof(sidecar_row_t)); + } + index->count--; + return 0; +} + +static int sidecar_append_guest_locked(sidecar_index_t *index, + const char *guest_name, + const char *token) +{ + sidecar_row_t *rows = (sidecar_row_t *) realloc( + index->rows, (index->count + 1) * sizeof(sidecar_row_t)); + if (!rows) + return -1; + + index->rows = rows; + index->rows[index->count].guest_name = strdup(guest_name); + if (!index->rows[index->count].guest_name) + return -1; + 
memcpy(index->rows[index->count].token, token, SIDECAR_TOKEN_NAME_LEN + 1); + index->count++; + return 0; +} + +static int sidecar_generate_token(char token[SIDECAR_TOKEN_NAME_LEN + 1]) +{ + uint64_t rnd = (((uint64_t) arc4random()) << 32) | arc4random(); + int n = snprintf(token, SIDECAR_TOKEN_NAME_LEN + 1, "%s%016llx", + SIDECAR_TOKEN_PREFIX, (unsigned long long) rnd); + if (n != SIDECAR_TOKEN_NAME_LEN) { + errno = EINVAL; + return -1; + } + return 0; +} + +static int sidecar_walk_parent_at(guest_fd_t dirfd, + const char *path, + sidecar_parent_t *parent) +{ + memset(parent, 0, sizeof(*parent)); + if (!path || path[0] == '\0') { + errno = ENOENT; + return -1; + } + + char normalized[LINUX_PATH_MAX]; + char work[LINUX_PATH_MAX]; + if (path[0] == '/') { + if (path_openat2_normalize_in_root(path, normalized, + sizeof(normalized)) < 0) { + errno = ENAMETOOLONG; + return -1; + } + str_copy_trunc(work, normalized, sizeof(work)); + parent->absolute = true; + } else { + str_copy_trunc(work, path, sizeof(work)); + } + + char *slash = strrchr(work, '/'); + const char *basename = slash ? 
slash + 1 : work; + if (*basename == '\0') { + errno = ENOENT; + return -1; + } + if (strlen(basename) >= sizeof(parent->basename)) { + errno = ENAMETOOLONG; + return -1; + } + memcpy(parent->basename, basename, strlen(basename) + 1); + if (sidecar_name_reserved(parent->basename)) { + errno = ENOENT; + return -1; + } + + if (slash) + *slash = '\0'; + else + str_copy_trunc(work, ".", sizeof(work)); + + if (!strcmp(work, ".")) { + bool absolute = false; + return sidecar_open_base(dirfd, path, normalized, sizeof(normalized), + &parent->dirfd, &absolute); + } + + char guest_parent[LINUX_PATH_MAX]; + if (path[0] == '/') { + if (!strcmp(work, ".")) + str_copy_trunc(guest_parent, "/", sizeof(guest_parent)); + else if (snprintf(guest_parent, sizeof(guest_parent), "/%s", work) >= + (int) sizeof(guest_parent)) { + errno = ENAMETOOLONG; + return -1; + } + } else { + str_copy_trunc(guest_parent, work, sizeof(guest_parent)); + } + + char host_parent[LINUX_PATH_MAX]; + int rc = sidecar_translate_lookup_at(dirfd, guest_parent, host_parent, + sizeof(host_parent)); + if (rc < 0) + return -1; + if (rc > 0) { + if (path[0] == '/' || dirfd == LINUX_AT_FDCWD) { + parent->dirfd = + open(host_parent, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + } else { + host_fd_ref_t ref; + if (host_dirfd_ref_open(dirfd, &ref) < 0) { + errno = EBADF; + return -1; + } + parent->dirfd = + openat(ref.fd, host_parent, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + host_fd_ref_close(&ref); + } + } else if (path[0] == '/') { + char sysroot[LINUX_PATH_MAX]; + if (!proc_sysroot_snapshot(sysroot, sizeof(sysroot))) { + errno = ENOENT; + return -1; + } + if (snprintf(host_parent, sizeof(host_parent), "%s/%s", sysroot, + work) >= (int) sizeof(host_parent)) { + errno = ENAMETOOLONG; + return -1; + } + parent->dirfd = open(host_parent, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + } else if (dirfd == LINUX_AT_FDCWD) { + parent->dirfd = open(work, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + } else { + host_fd_ref_t ref; + if 
(host_dirfd_ref_open(dirfd, &ref) < 0) { + errno = EBADF; + return -1; + } + parent->dirfd = + openat(ref.fd, work, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + host_fd_ref_close(&ref); + } + + return parent->dirfd < 0 ? -1 : 0; +} + +static void sidecar_parent_close(sidecar_parent_t *parent) +{ + if (parent && parent->dirfd >= 0) { + close(parent->dirfd); + parent->dirfd = -1; + } +} + +static int sidecar_parent_stat(sidecar_parent_t *parent, struct stat *st) +{ + if (fstat(parent->dirfd, st) < 0) { + sidecar_parent_close(parent); + return -1; + } + return 0; +} + +static const char *sidecar_existing_name_locked(const sidecar_index_t *index, + int dirfd, + const char *guest_name) +{ + const char *mapped = sidecar_lookup_guest(index, guest_name); + if (mapped) + return mapped; + return sidecar_exact_name_exists(dirfd, guest_name) == 1 ? guest_name + : NULL; +} + +int sidecar_openat(guest_fd_t dirfd, + const char *path, + int linux_flags, + mode_t mode) +{ + if (!sidecar_active() || !(linux_flags & LINUX_O_CREAT)) + return (int) SIDECAR_NOT_HANDLED; + + sidecar_parent_t parent; + if (sidecar_walk_parent_at(dirfd, path, &parent) < 0) + return -1; + if (sidecar_name_reserved(parent.basename)) { + sidecar_parent_close(&parent); + errno = ENOENT; + return -1; + } + + int lock_fd = -1; + if (sidecar_lock_index(parent.dirfd, &lock_fd) < 0) { + sidecar_parent_close(&parent); + return -1; + } + + sidecar_index_t index; + if (sidecar_load_locked_index(parent.dirfd, lock_fd, &index) < 0) { + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return -1; + } + + int mac_flags = translate_open_flags(linux_flags); + const char *existing = + sidecar_existing_name_locked(&index, parent.dirfd, parent.basename); + if (existing) { + if (linux_flags & LINUX_O_EXCL) { + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + errno = EEXIST; + return -1; + } + int fd = openat(parent.dirfd, existing, mac_flags, mode); + 
sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return fd; + } + + int fd = -1; + char token[SIDECAR_TOKEN_NAME_LEN + 1]; + for (;;) { + if (sidecar_generate_token(token) < 0) + break; + fd = openat(parent.dirfd, token, mac_flags | O_EXCL, mode); + if (fd >= 0) + break; + if (errno != EEXIST) + break; + } + if (fd < 0) { + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return -1; + } + + sidecar_row_t *rows = (sidecar_row_t *) realloc( + index.rows, (index.count + 1) * sizeof(sidecar_row_t)); + if (!rows) { + int saved_errno = errno; + close(fd); + unlinkat(parent.dirfd, token, 0); + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + errno = saved_errno; + return -1; + } + index.rows = rows; + index.rows[index.count].guest_name = strdup(parent.basename); + memcpy(index.rows[index.count].token, token, sizeof(token)); + index.count++; + if (sidecar_write_locked_index(parent.dirfd, lock_fd, &index) < 0) { + int saved_errno = errno; + close(fd); + unlinkat(parent.dirfd, token, 0); + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + errno = saved_errno; + return -1; + } + + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return fd; +} + +int64_t sidecar_mkdirat(guest_fd_t dirfd, const char *path, mode_t mode) +{ + if (!sidecar_active()) + return SIDECAR_NOT_HANDLED; + + sidecar_parent_t parent; + if (sidecar_walk_parent_at(dirfd, path, &parent) < 0) + return linux_errno(); + + int lock_fd = -1; + if (sidecar_lock_index(parent.dirfd, &lock_fd) < 0) { + sidecar_parent_close(&parent); + return linux_errno(); + } + + sidecar_index_t index; + if (sidecar_load_locked_index(parent.dirfd, lock_fd, &index) < 0) { + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return linux_errno(); + } + if (sidecar_existing_name_locked(&index, 
parent.dirfd, parent.basename)) { + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return -LINUX_EEXIST; + } + + char token[SIDECAR_TOKEN_NAME_LEN + 1]; + for (;;) { + if (sidecar_generate_token(token) < 0) + break; + if (mkdirat(parent.dirfd, token, mode) == 0) + break; + if (errno != EEXIST) { + token[0] = '\0'; + break; + } + } + if (token[0] == '\0') { + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return linux_errno(); + } + + sidecar_row_t *rows = (sidecar_row_t *) realloc( + index.rows, (index.count + 1) * sizeof(sidecar_row_t)); + if (!rows) { + int saved_errno = errno; + unlinkat(parent.dirfd, token, AT_REMOVEDIR); + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + errno = saved_errno; + return linux_errno(); + } + index.rows = rows; + index.rows[index.count].guest_name = strdup(parent.basename); + memcpy(index.rows[index.count].token, token, sizeof(token)); + index.count++; + if (sidecar_write_locked_index(parent.dirfd, lock_fd, &index) < 0) { + int saved_errno = errno; + unlinkat(parent.dirfd, token, AT_REMOVEDIR); + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + errno = saved_errno; + return linux_errno(); + } + + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return 0; +} + +int64_t sidecar_unlinkat(guest_fd_t dirfd, const char *path, int flags) +{ + if (!sidecar_active()) + return SIDECAR_NOT_HANDLED; + + sidecar_parent_t parent; + if (sidecar_walk_parent_at(dirfd, path, &parent) < 0) + return linux_errno(); + + int lock_fd = -1; + if (sidecar_lock_index(parent.dirfd, &lock_fd) < 0) { + sidecar_parent_close(&parent); + return linux_errno(); + } + + sidecar_index_t index; + if (sidecar_load_locked_index(parent.dirfd, lock_fd, &index) < 0) { + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + 
return linux_errno(); + } + + size_t remove_idx = index.count; + char host_name[SIDECAR_TOKEN_NAME_LEN + 1]; + bool have_host_name = false; + for (size_t i = 0; i < index.count; i++) { + if (!strcmp(index.rows[i].guest_name, parent.basename)) { + memcpy(host_name, index.rows[i].token, sizeof(host_name)); + remove_idx = i; + have_host_name = true; + break; + } + } + + int64_t rc = 0; + if (have_host_name) { + /* Write the index update first so that an interrupted unlinkat + * does not leave the on-disk token without a mapping. If the + * unlinkat fails, restore the mapping and rewrite the index; + * the second write going wrong is logged but cannot be helped. + */ + sidecar_row_t saved_row = index.rows[remove_idx]; + char *saved_name = strdup(saved_row.guest_name); + if (!saved_name) { + rc = linux_errno(); + /* No mutation happened yet, so reporting the allocation + * failure keeps the index and host state unchanged. + */ + } else { + sidecar_remove_guest_locked(&index, remove_idx); + if (sidecar_write_locked_index(parent.dirfd, lock_fd, &index) < 0) { + rc = linux_errno(); + free(saved_name); + /* No host mutation happened, in-memory index has the entry + * removed but on-disk still holds the original. That is + * consistent with the failure being reported to the guest. 
+ */ + } else if (unlinkat(parent.dirfd, host_name, flags) < 0) { + int saved_errno = errno; + sidecar_row_t *rows = (sidecar_row_t *) realloc( + index.rows, (index.count + 1) * sizeof(sidecar_row_t)); + if (rows && saved_name) { + index.rows = rows; + index.rows[index.count].guest_name = saved_name; + memcpy(index.rows[index.count].token, saved_row.token, + sizeof(saved_row.token)); + index.count++; + (void) sidecar_write_locked_index(parent.dirfd, lock_fd, + &index); + } else { + free(saved_name); + } + errno = saved_errno; + rc = linux_errno(); + } else { + free(saved_name); + } + } + } else { + int exists = sidecar_exact_name_exists(parent.dirfd, parent.basename); + if (exists < 0) + rc = linux_errno(); + else if (exists == 0) + rc = -LINUX_ENOENT; + else if (unlinkat(parent.dirfd, parent.basename, flags) < 0) + rc = linux_errno(); + } + + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&parent); + return rc; +} + +static int sidecar_resolve_existing_at(guest_fd_t dirfd, + const char *path, + sidecar_parent_t *parent, + char host_name[NAME_MAX + 1]) +{ + if (sidecar_walk_parent_at(dirfd, path, parent) < 0) + return -1; + + sidecar_index_t index; + if (sidecar_load_index(parent->dirfd, &index) < 0) { + sidecar_parent_close(parent); + return -1; + } + + const char *mapped = sidecar_lookup_guest(&index, parent->basename); + if (mapped) { + str_copy_trunc(host_name, mapped, NAME_MAX + 1); + sidecar_index_free(&index); + return 0; + } + sidecar_index_free(&index); + + int exists = sidecar_exact_name_exists(parent->dirfd, parent->basename); + if (exists < 0) { + sidecar_parent_close(parent); + return -1; + } + if (exists == 0) { + sidecar_parent_close(parent); + errno = ENOENT; + return -1; + } + str_copy_trunc(host_name, parent->basename, NAME_MAX + 1); + return 0; +} + +int64_t sidecar_linkat(guest_fd_t olddirfd, + const char *oldpath, + guest_fd_t newdirfd, + const char *newpath, + int flags) +{ + if (!sidecar_active()) + return 
SIDECAR_NOT_HANDLED; + + sidecar_parent_t old_parent; + char old_host[NAME_MAX + 1]; + if (sidecar_resolve_existing_at(olddirfd, oldpath, &old_parent, old_host) < + 0) { + return linux_errno(); + } + + sidecar_parent_t new_parent; + if (sidecar_walk_parent_at(newdirfd, newpath, &new_parent) < 0) { + sidecar_parent_close(&old_parent); + return linux_errno(); + } + + int lock_fd = -1; + if (sidecar_lock_index(new_parent.dirfd, &lock_fd) < 0) { + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return linux_errno(); + } + + sidecar_index_t index; + if (sidecar_load_locked_index(new_parent.dirfd, lock_fd, &index) < 0) { + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return linux_errno(); + } + if (sidecar_existing_name_locked(&index, new_parent.dirfd, + new_parent.basename)) { + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return -LINUX_EEXIST; + } + + char token[SIDECAR_TOKEN_NAME_LEN + 1]; + int rc = -LINUX_EIO; + int mac_flags = translate_at_flags(flags); + for (;;) { + if (sidecar_generate_token(token) < 0) + break; + if (linkat(old_parent.dirfd, old_host, new_parent.dirfd, token, + mac_flags) == 0) { + rc = 0; + break; + } + if (errno != EEXIST) { + rc = linux_errno(); + break; + } + } + if (rc == 0) { + if (sidecar_append_guest_locked(&index, new_parent.basename, token) < + 0) { + int saved_errno = errno; + unlinkat(new_parent.dirfd, token, 0); + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + errno = saved_errno; + return linux_errno(); + } + if (sidecar_write_locked_index(new_parent.dirfd, lock_fd, &index) < 0) { + int saved_errno = errno; + unlinkat(new_parent.dirfd, token, 0); + errno = saved_errno; + rc = linux_errno(); + } + } + + sidecar_index_free(&index); + sidecar_unlock_index(lock_fd); + 
sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return rc; +} + +typedef struct { + size_t index_pos; + bool mapped; + bool exists; + char host_name[NAME_MAX + 1]; +} sidecar_entry_state_t; + +static int sidecar_read_entry_state(const sidecar_index_t *index, + int dirfd, + const char *guest_name, + sidecar_entry_state_t *state) +{ + memset(state, 0, sizeof(*state)); + state->index_pos = index->count; + + ssize_t mapped_idx = sidecar_find_guest_index(index, guest_name); + if (mapped_idx >= 0) { + state->mapped = true; + state->exists = true; + state->index_pos = (size_t) mapped_idx; + str_copy_trunc(state->host_name, index->rows[mapped_idx].token, + sizeof(state->host_name)); + return 0; + } + + int exact = sidecar_exact_name_exists(dirfd, guest_name); + if (exact < 0) + return -1; + if (exact == 1) { + state->exists = true; + str_copy_trunc(state->host_name, guest_name, sizeof(state->host_name)); + } + return 0; +} + +static int sidecar_lock_two_indices(sidecar_parent_t *first, + sidecar_parent_t *second, + bool same_dir, + int *first_lock_fd, + int *second_lock_fd, + bool *swapped) +{ + struct stat first_st; + struct stat second_st; + if (sidecar_parent_stat(first, &first_st) < 0 || + sidecar_parent_stat(second, &second_st) < 0) { + return -1; + } + + sidecar_parent_t *lock_a = first; + sidecar_parent_t *lock_b = second; + *swapped = false; + if (first_st.st_dev > second_st.st_dev || + (first_st.st_dev == second_st.st_dev && + first_st.st_ino > second_st.st_ino)) { + lock_a = second; + lock_b = first; + *swapped = true; + } + + pthread_mutex_lock(&sidecar_global_lock); + if (sidecar_lock_index_fcntl(lock_a->dirfd, first_lock_fd) < 0) { + int saved_errno = errno; + pthread_mutex_unlock(&sidecar_global_lock); + errno = saved_errno; + return -1; + } + if (same_dir) { + *second_lock_fd = *first_lock_fd; + return 0; + } + if (sidecar_lock_index_fcntl(lock_b->dirfd, second_lock_fd) < 0) { + int saved_errno = errno; + 
sidecar_unlock_index_fcntl(*first_lock_fd); + *first_lock_fd = -1; + pthread_mutex_unlock(&sidecar_global_lock); + errno = saved_errno; + return -1; + } + return 0; +} + +static void sidecar_unlock_two_indices(int first_lock_fd, + int second_lock_fd, + bool same_dir) +{ + if (same_dir) { + sidecar_unlock_index_fcntl(first_lock_fd); + } else { + sidecar_unlock_index_fcntl(second_lock_fd); + sidecar_unlock_index_fcntl(first_lock_fd); + } + pthread_mutex_unlock(&sidecar_global_lock); +} + +int64_t sidecar_renameat(guest_fd_t olddirfd, + const char *oldpath, + guest_fd_t newdirfd, + const char *newpath, + int flags) +{ + if (!sidecar_active()) + return SIDECAR_NOT_HANDLED; + + if (flags & LINUX_RENAME_EXCHANGE) { + char old_host_path[LINUX_PATH_MAX]; + char new_host_path[LINUX_PATH_MAX]; + int old_rc = sidecar_translate_lookup_at( + olddirfd, oldpath, old_host_path, sizeof(old_host_path)); + int new_rc = sidecar_translate_lookup_at( + newdirfd, newpath, new_host_path, sizeof(new_host_path)); + if (old_rc < 0 || new_rc < 0) + return linux_errno(); + if (old_rc == 0 || new_rc == 0) + return SIDECAR_NOT_HANDLED; + + if (renamex_np(old_host_path, new_host_path, RENAME_SWAP) < 0) + return linux_errno(); + return 0; + } + + sidecar_parent_t old_parent; + if (sidecar_walk_parent_at(olddirfd, oldpath, &old_parent) < 0) + return linux_errno(); + + sidecar_parent_t new_parent; + if (sidecar_walk_parent_at(newdirfd, newpath, &new_parent) < 0) { + sidecar_parent_close(&old_parent); + return linux_errno(); + } + + if (!strcmp(old_parent.basename, new_parent.basename)) { + struct stat old_same; + struct stat new_same; + if (fstat(old_parent.dirfd, &old_same) == 0 && + fstat(new_parent.dirfd, &new_same) == 0 && + old_same.st_dev == new_same.st_dev && + old_same.st_ino == new_same.st_ino) { + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return 0; + } + } + + struct stat old_dir_st; + struct stat new_dir_st; + if (fstat(old_parent.dirfd, &old_dir_st) < 0 || 
+ fstat(new_parent.dirfd, &new_dir_st) < 0) { + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return linux_errno(); + } + bool same_dir = old_dir_st.st_dev == new_dir_st.st_dev && + old_dir_st.st_ino == new_dir_st.st_ino; + + int first_lock_fd = -1; + int second_lock_fd = -1; + bool swapped = false; + if (sidecar_lock_two_indices(&old_parent, &new_parent, same_dir, + &first_lock_fd, &second_lock_fd, + &swapped) < 0) { + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return linux_errno(); + } + + sidecar_index_t old_index = {0}; + sidecar_index_t new_index = {0}; + int rc; + if (same_dir) { + rc = sidecar_load_locked_index(old_parent.dirfd, first_lock_fd, + &old_index); + } else if (!swapped) { + rc = sidecar_load_locked_index(old_parent.dirfd, first_lock_fd, + &old_index); + if (rc == 0) + rc = sidecar_load_locked_index(new_parent.dirfd, second_lock_fd, + &new_index); + } else { + rc = sidecar_load_locked_index(new_parent.dirfd, first_lock_fd, + &new_index); + if (rc == 0) + rc = sidecar_load_locked_index(old_parent.dirfd, second_lock_fd, + &old_index); + } + if (rc < 0) { + if (!same_dir) { + sidecar_index_free(&old_index); + sidecar_index_free(&new_index); + } else { + sidecar_index_free(&old_index); + } + sidecar_unlock_two_indices(first_lock_fd, second_lock_fd, same_dir); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return linux_errno(); + } + + /* Snapshot the loaded indices so that a host renameat failure later + * can roll the on-disk index back. Without this, an index update + * followed by a failed host renameat leaves the mapping pointing at + * a moved or missing token. 
+ */ + sidecar_index_t saved_old = {0}; + sidecar_index_t saved_new = {0}; + if (sidecar_index_clone(&old_index, &saved_old) < 0 || + (!same_dir && sidecar_index_clone(&new_index, &saved_new) < 0)) { + int64_t err = linux_errno(); + sidecar_index_free(&saved_old); + if (!same_dir) { + sidecar_index_free(&new_index); + sidecar_index_free(&saved_new); + } + sidecar_index_free(&old_index); + sidecar_unlock_two_indices(first_lock_fd, second_lock_fd, same_dir); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return err; + } + + sidecar_index_t *dst_index = same_dir ? &old_index : &new_index; + sidecar_entry_state_t old_state; + sidecar_entry_state_t new_state; + if (sidecar_read_entry_state(&old_index, old_parent.dirfd, + old_parent.basename, &old_state) < 0 || + sidecar_read_entry_state(dst_index, new_parent.dirfd, + new_parent.basename, &new_state) < 0) { + int64_t err = linux_errno(); + if (!same_dir) + sidecar_index_free(&new_index); + sidecar_index_free(&old_index); + sidecar_unlock_two_indices(first_lock_fd, second_lock_fd, same_dir); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return err; + } + + if (!old_state.exists) { + if (!same_dir) + sidecar_index_free(&new_index); + sidecar_index_free(&old_index); + sidecar_unlock_two_indices(first_lock_fd, second_lock_fd, same_dir); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return -LINUX_ENOENT; + } + if ((flags & LINUX_RENAME_NOREPLACE) && new_state.exists) { + if (!same_dir) + sidecar_index_free(&new_index); + sidecar_index_free(&old_index); + sidecar_unlock_two_indices(first_lock_fd, second_lock_fd, same_dir); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return -LINUX_EEXIST; + } + + char target_host[NAME_MAX + 1]; + bool add_new_mapping = false; + bool rename_existing_old_mapping = + same_dir && old_state.mapped && !new_state.exists; + if (new_state.exists) { + str_copy_trunc(target_host, 
new_state.host_name, sizeof(target_host)); + } else if (old_state.mapped) { + str_copy_trunc(target_host, old_state.host_name, sizeof(target_host)); + add_new_mapping = !same_dir; + } else { + for (;;) { + if (sidecar_generate_token(target_host) < 0) + break; + int probe = fstatat(new_parent.dirfd, target_host, + &(struct stat) {0}, AT_SYMLINK_NOFOLLOW); + if (probe < 0 && errno == ENOENT) { + add_new_mapping = true; + break; + } + if (probe == 0) + continue; + if (errno == ENOENT) + continue; + break; + } + if (!add_new_mapping) { + int64_t err = linux_errno(); + sidecar_index_free(&saved_old); + if (!same_dir) { + sidecar_index_free(&saved_new); + sidecar_index_free(&new_index); + } + sidecar_index_free(&old_index); + sidecar_unlock_two_indices(first_lock_fd, second_lock_fd, same_dir); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return err; + } + } + + int64_t result = 0; + int mod_rc = 0; + if (rename_existing_old_mapping) { + free(old_index.rows[old_state.index_pos].guest_name); + old_index.rows[old_state.index_pos].guest_name = + strdup(new_parent.basename); + if (!old_index.rows[old_state.index_pos].guest_name) + mod_rc = -1; + } else if (old_state.mapped) { + if (sidecar_remove_guest_locked(&old_index, old_state.index_pos) < 0) + mod_rc = -1; + } + if (mod_rc == 0) { + if (new_state.mapped) { + size_t idx = new_state.index_pos; + if (same_dir && old_state.mapped && idx > old_state.index_pos) + idx--; + free(dst_index->rows[idx].guest_name); + dst_index->rows[idx].guest_name = strdup(new_parent.basename); + if (!dst_index->rows[idx].guest_name) + mod_rc = -1; + } else if (add_new_mapping) { + if (sidecar_append_guest_locked(dst_index, new_parent.basename, + target_host) < 0) + mod_rc = -1; + } + } + if (mod_rc < 0) { + result = linux_errno(); + goto cleanup; + } + + /* Commit the index changes to disk before any host filesystem + * mutation so a failed write does not leave an orphan host file. 
The + * host renameat is the actual commit point; on host failure, revert + * the on-disk index by writing the saved snapshot back. + */ + if (same_dir) { + rc = sidecar_write_locked_index(old_parent.dirfd, first_lock_fd, + &old_index); + } else if (!swapped) { + rc = sidecar_write_locked_index(old_parent.dirfd, first_lock_fd, + &old_index); + if (rc == 0) + rc = sidecar_write_locked_index(new_parent.dirfd, second_lock_fd, + &new_index); + } else { + rc = sidecar_write_locked_index(new_parent.dirfd, first_lock_fd, + &new_index); + if (rc == 0) + rc = sidecar_write_locked_index(old_parent.dirfd, second_lock_fd, + &old_index); + } + if (rc < 0) { + result = linux_errno(); + goto cleanup; + } + + if (!(same_dir && old_state.mapped && !new_state.exists && + !strcmp(target_host, old_state.host_name))) { + if (renameat(old_parent.dirfd, old_state.host_name, new_parent.dirfd, + target_host) < 0) { + result = linux_errno(); + /* Roll the index back to the pre-modification state so the + * mapping stays consistent with the unchanged host tree. A + * failed rollback write is the best-effort case; the guest + * sees the original renameat errno regardless. 
+ */ + if (same_dir) { + (void) sidecar_write_locked_index(old_parent.dirfd, + first_lock_fd, &saved_old); + } else if (!swapped) { + (void) sidecar_write_locked_index(old_parent.dirfd, + first_lock_fd, &saved_old); + (void) sidecar_write_locked_index(new_parent.dirfd, + second_lock_fd, &saved_new); + } else { + (void) sidecar_write_locked_index(new_parent.dirfd, + first_lock_fd, &saved_new); + (void) sidecar_write_locked_index(old_parent.dirfd, + second_lock_fd, &saved_old); + } + } + } + +cleanup: + sidecar_index_free(&saved_old); + if (!same_dir) { + sidecar_index_free(&saved_new); + sidecar_index_free(&new_index); + } + sidecar_index_free(&old_index); + sidecar_unlock_two_indices(first_lock_fd, second_lock_fd, same_dir); + sidecar_parent_close(&old_parent); + sidecar_parent_close(&new_parent); + return result; +} diff --git a/src/syscall/sidecar.h b/src/syscall/sidecar.h new file mode 100644 index 0000000..a28dfce --- /dev/null +++ b/src/syscall/sidecar.h @@ -0,0 +1,48 @@ +/* Case-folding fallback VFS helpers + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include + +#include "syscall/internal.h" + +#define SIDECAR_INDEX_NAME ".elfuse_case_index" +#define SIDECAR_TOKEN_PREFIX ".ef_" +#define SIDECAR_TOKEN_HEX_LEN 16 +#define SIDECAR_TOKEN_NAME_LEN (4 + SIDECAR_TOKEN_HEX_LEN) +#define SIDECAR_NOT_HANDLED ((int64_t) INT64_MIN) + +bool sidecar_active(void); +bool sidecar_name_reserved(const char *name); +bool sidecar_path_targets_reserved_name(const char *path); +int sidecar_translate_lookup_at(guest_fd_t dirfd, + const char *path, + char *out, + size_t outsz); +int sidecar_translate_dirent_name(guest_fd_t dirfd, + const char *host_name, + char *guest_name, + size_t guest_name_sz); +int sidecar_openat(guest_fd_t dirfd, + const char *path, + int linux_flags, + mode_t mode); +int64_t sidecar_mkdirat(guest_fd_t dirfd, const char *path, mode_t mode); +int64_t 
sidecar_unlinkat(guest_fd_t dirfd, const char *path, int flags); +int64_t sidecar_linkat(guest_fd_t olddirfd, + const char *oldpath, + guest_fd_t newdirfd, + const char *newpath, + int flags); +int64_t sidecar_renameat(guest_fd_t olddirfd, + const char *oldpath, + guest_fd_t newdirfd, + const char *newpath, + int flags); diff --git a/tests/test-case-collision.c b/tests/test-case-collision.c new file mode 100644 index 0000000..02a8faf --- /dev/null +++ b/tests/test-case-collision.c @@ -0,0 +1,482 @@ +/* Case-collision regression tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" +#include "test-util.h" + +#ifndef SYS_renameat2 +#define SYS_renameat2 276 /* fallback when the headers do not define it; 276 is renameat2 in the asm-generic (aarch64) syscall table */ +#endif + +#ifndef SYS_getdents64 +#define SYS_getdents64 61 /* asm-generic (aarch64) number for getdents64 */ +#endif + +#ifndef SYS_statx +#define SYS_statx 291 /* asm-generic (aarch64) number for statx */ +#endif + +#define LINUX_RENAME_EXCHANGE (1 << 1) /* same value as Linux RENAME_EXCHANGE */ + +int passes = 0, fails = 0; + +typedef struct { /* record header overlaid on raw getdents64 buffers; d_reclen is used as the byte stride to the next record */ + unsigned long long d_ino; + long long d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[]; +} linux_dirent64_t; + +static int create_file(const char *path, const char *contents) +{ /* Create or truncate path with mode 0644 and write contents; returns -1 if open fails, otherwise whatever write_fd_all returns. */ + int fd = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0644); + if (fd < 0) + return -1; + size_t len = strlen(contents); + int rc = write_fd_all(fd, contents, len); + close(fd); + return rc; +} + +static int dir_has_entry(const char *path, const char *needle) +{ /* Returns 1 if readdir yields an entry whose name equals needle byte-for-byte, 0 if none does, -1 if opendir fails. */ + DIR *dir = opendir(path); + if (!dir) + return -1; + + int found = 0; + struct dirent *de; + while ((de = readdir(dir)) != NULL) { + if (!strcmp(de->d_name, needle)) { + found = 1; + break; + } + } + closedir(dir); + return found; +} + +static void build_long_name(char *out, size_t outsz, char first) +{ /* Writes a string of outsz - 1 bytes: first, then 'a' padding, then a NUL. Callers must pass outsz >= 2: with 1 the NUL overwrites first, and with 0 outsz - 1 wraps around. */ + memset(out, 'a', outsz - 1); + out[0] = first; + out[outsz - 1] = '\0'; +} + +static int xattr_supported(void) +{ /* NOTE(review): no caller is visible in this chunk, and the result is also 1 when setxattr fails with ENOTSUP/EOPNOTSUPP; confirm the intended meaning. */ + const char *probe =
"/tmp/elfuse-case-collision-xattr-probe"; + unlink(probe); + if (create_file(probe, "probe\n") < 0) + return 0; + + int rc = setxattr(probe, "user.elfuse_probe", "x", 1, 0); + int ok = (rc == 0 || errno == ENOTSUP || errno == EOPNOTSUPP); /* 1 on success and also on a clean "not supported" errno */ + unlink(probe); + return ok; +} + +static int getdents_contains_after_partial(const char *dir_path, + const char *first_name, + const char *second_name) +{ /* Reads one batch into a 48-byte buffer, drains the rest of the directory, then reopens it with openat(fd, ".") and returns 1 only if the reopened listing contains both names. Any open or getdents64 failure returns 0. */ + int fd = open(dir_path, O_RDONLY | O_DIRECTORY); + if (fd < 0) + return 0; + + char small[48]; + char large[1024]; + long n1 = syscall(SYS_getdents64, fd, small, sizeof(small)); /* deliberately short first read; its records are not inspected */ + if (n1 < 0) { + close(fd); + return 0; + } + + int saw_first = 0; + int saw_second = 0; + for (;;) { + long n = syscall(SYS_getdents64, fd, large, sizeof(large)); + if (n < 0) { + close(fd); + return 0; + } + if (n == 0) + break; + long off = 0; + while (off < n) { + linux_dirent64_t *de = (linux_dirent64_t *) (large + off); + if (!strcmp(de->d_name, first_name)) + saw_first = 1; + if (!strcmp(de->d_name, second_name)) + saw_second = 1; + off += de->d_reclen; + } + } + + int reopen_fd = openat(fd, ".", O_RDONLY | O_DIRECTORY); + close(fd); + if (reopen_fd < 0) + return 0; + + saw_first = 0; + saw_second = 0; /* the flags from the first drain are discarded; only the reopened scan decides the result */ + for (;;) { + long n = syscall(SYS_getdents64, reopen_fd, large, sizeof(large)); + if (n < 0) { + close(reopen_fd); + return 0; + } + if (n == 0) + break; + long off = 0; + while (off < n) { + linux_dirent64_t *de = (linux_dirent64_t *) (large + off); + if (!strcmp(de->d_name, first_name)) + saw_first = 1; + if (!strcmp(de->d_name, second_name)) + saw_second = 1; + off += de->d_reclen; + } + } + close(reopen_fd); + return saw_first && saw_second; +} + +static int sidecar_fallback_active(const char *dir_path) +{ /* Nonzero when dir_path/.elfuse_case_index is visible to access(F_OK). */ + char index_path[512]; + snprintf(index_path, sizeof(index_path), "%s/.elfuse_case_index", dir_path); + return access(index_path, F_OK) == 0; +} + +int main(void) +{ /* All fixtures live under a per-process directory in /tmp. */ + char base[256]; + char dir_a[320]; + char dir_b[320]; + snprintf(base, sizeof(base), "/tmp/elfuse-case-collision-%ld",
+ (long) getpid()); + snprintf(dir_a, sizeof(dir_a), "%s/dir-a", base); + snprintf(dir_b, sizeof(dir_b), "%s/dir-b", base); + + mkdir("/tmp", 0777); /* the results of these four mkdir calls are not checked */ + mkdir(base, 0777); + mkdir(dir_a, 0777); + mkdir(dir_b, 0777); + + printf("test-case-collision: case collision tests\n"); + + TEST("readdir lists Foo and foo distinctly"); + { /* create both spellings fresh, then require readdir to report each one exactly */ + char upper[320]; + char lower[320]; + snprintf(upper, sizeof(upper), "%s/Foo", base); + snprintf(lower, sizeof(lower), "%s/foo", base); + + unlink(upper); + unlink(lower); + if (create_file(upper, "upper\n") < 0 || + create_file(lower, "lower\n") < 0) { + FAIL("failed to create colliding files"); + } else if (dir_has_entry(base, "Foo") != 1 || + dir_has_entry(base, "foo") != 1) { + FAIL("readdir collapsed colliding names"); + } else { + PASS(); + } + } + + TEST("renameat2 exchange swaps Foo and foo"); + { /* reuses the Foo ("upper\n") and foo ("lower\n") files from the previous test; after the exchange the contents must be swapped */ + char upper[320]; + char lower[320]; + char buf_upper[32]; + char buf_lower[32]; + snprintf(upper, sizeof(upper), "%s/Foo", base); + snprintf(lower, sizeof(lower), "%s/foo", base); + + if (syscall(SYS_renameat2, AT_FDCWD, upper, AT_FDCWD, lower, + LINUX_RENAME_EXCHANGE) < 0) { + FAIL("renameat2 exchange failed"); + } else if (read_file_nul(upper, buf_upper, sizeof(buf_upper)) <= 0 || + read_file_nul(lower, buf_lower, sizeof(buf_lower)) <= 0) { + FAIL("failed to read exchanged files"); + } else if (strcmp(buf_upper, "lower\n") || + strcmp(buf_lower, "upper\n")) { + FAIL("renameat2 exchange produced wrong contents"); + } else { + PASS(); + } + } + + TEST("renameat2 exchange swaps colliding names across directories"); + { /* same exchange, but the two spellings live in dir-a and dir-b */ + char left[320]; + char right[320]; + char buf_left[32]; + char buf_right[32]; + snprintf(left, sizeof(left), "%s/Foo", dir_a); + snprintf(right, sizeof(right), "%s/foo", dir_b); + unlink(left); + unlink(right); + + if (create_file(left, "left\n") < 0 || + create_file(right, "right\n") < 0) { + FAIL("failed to create cross-directory colliding files"); + } else if (syscall(SYS_renameat2, AT_FDCWD, left, AT_FDCWD, right, +
LINUX_RENAME_EXCHANGE) < 0) { + FAIL("cross-directory rename exchange failed"); + } else if (read_file_nul(left, buf_left, sizeof(buf_left)) <= 0 || + read_file_nul(right, buf_right, sizeof(buf_right)) <= 0) { + FAIL("cross-directory exchanged files not readable"); + } else if (strcmp(buf_left, "right\n") || strcmp(buf_right, "left\n")) { + FAIL("cross-directory exchange contents mismatch"); + } else { + PASS(); + } + } + + TEST("linkat creates second colliding spelling"); + { /* link(2) a second spelling, require both names to share one inode, then unlink the original and require that only HARDLINK is still listed */ + char src[320]; + char alias[320]; + struct stat st_src; + struct stat st_alias; + + snprintf(src, sizeof(src), "%s/hardlink", base); + snprintf(alias, sizeof(alias), "%s/HARDLINK", base); + unlink(src); + unlink(alias); + + if (create_file(src, "inode\n") < 0) { + FAIL("failed to create hardlink source"); + } else if (link(src, alias) < 0) { + FAIL("linkat failed"); + } else if (stat(src, &st_src) < 0 || stat(alias, &st_alias) < 0) { + FAIL("stat after link failed"); + } else if (st_src.st_ino != st_alias.st_ino || st_src.st_nlink < 2) { + FAIL("colliding hardlinks do not share inode"); + } else if (unlink(src) < 0 || stat(alias, &st_alias) < 0 || + dir_has_entry(base, "hardlink") != 0 || + dir_has_entry(base, "HARDLINK") != 1) { + FAIL("unlink removed wrong hardlink entry"); + } else { + PASS(); + } + } + + TEST("access and statx distinguish colliding spellings"); + { /* 0x7ff is the STATX_BASIC_STATS mask. NOTE(review): the checks only require both spellings to resolve to regular files; they do not compare the two results with each other. */ + char upper[320]; + char lower[320]; + struct statx sx_upper; + struct statx sx_lower; + snprintf(upper, sizeof(upper), "%s/Foo", base); + snprintf(lower, sizeof(lower), "%s/foo", base); + memset(&sx_upper, 0, sizeof(sx_upper)); + memset(&sx_lower, 0, sizeof(sx_lower)); + + if (access(upper, F_OK) < 0 || access(lower, F_OK) < 0) { + FAIL("access on colliding spellings failed"); + } else if (syscall(SYS_statx, AT_FDCWD, upper, 0, 0x7ff, &sx_upper) < + 0 || + syscall(SYS_statx, AT_FDCWD, lower, 0, 0x7ff, &sx_lower) < + 0) { + FAIL("statx on colliding spellings failed"); + } else if (!S_ISREG(sx_upper.stx_mode) ||
!S_ISREG(sx_lower.stx_mode)) { + FAIL("statx returned wrong file type"); + } else { + PASS(); + } + } + + TEST("getdents64 survives partial read and reopen-by-fd"); + { + if (getdents_contains_after_partial(base, "Foo", "foo")) + PASS(); + else + FAIL("getdents64 lost colliding names after partial read"); + } + + TEST("xattr works on colliding spellings"); + { /* sets a different value on each spelling and reads both back; an ENOTSUP/EOPNOTSUPP from the first setxattr counts as a pass */ + char upper[320]; + char lower[320]; + char value[32]; + snprintf(upper, sizeof(upper), "%s/Foo", base); + snprintf(lower, sizeof(lower), "%s/foo", base); + memset(value, 0, sizeof(value)); /* zero-filled so the 5-byte xattr value reads back as a NUL-terminated string */ + + errno = 0; /* cleared so the errno test in the second branch is not fed by an earlier call */ + if (setxattr(upper, "user.elfuse_case", "upper", 5, 0) < 0 && + errno != ENOTSUP && errno != EOPNOTSUPP) { + FAIL("setxattr on colliding spelling failed"); + } else if (errno == ENOTSUP || errno == EOPNOTSUPP) { + PASS(); + } else if (setxattr(lower, "user.elfuse_case", "lower", 5, 0) < 0) { + FAIL("setxattr on second colliding spelling failed"); + } else if (getxattr(upper, "user.elfuse_case", value, sizeof(value)) != + 5) { + FAIL("getxattr upper failed"); + } else if (strcmp(value, "upper")) { + FAIL("upper xattr value mismatch"); + } else { + memset(value, 0, sizeof(value)); + if (getxattr(lower, "user.elfuse_case", value, sizeof(value)) != + 5) { + FAIL("getxattr lower failed"); + } else if (strcmp(value, "lower")) { + FAIL("lower xattr value mismatch"); + } else { + PASS(); + } + } + } + + TEST("plain rename updates sidecar mapping for colliding source"); + { /* renames foo to bar and requires foo to vanish (ENOENT), bar to carry either fixture content, and Foo to stay openable */ + char old_path[320]; + char new_path[320]; + char untouched_path[320]; + char value[32]; + + snprintf(old_path, sizeof(old_path), "%s/foo", base); + snprintf(new_path, sizeof(new_path), "%s/bar", base); + snprintf(untouched_path, sizeof(untouched_path), "%s/Foo", base); + unlink(new_path); + + if (rename(old_path, new_path) < 0) { + FAIL("plain rename failed"); + } else if (access(old_path, F_OK) == 0 || errno != ENOENT) { + FAIL("old colliding spelling still resolves after rename"); + } else if (read_file_nul(new_path, value, sizeof(value))
<= 0) { + FAIL("renamed colliding spelling not readable"); + } else if (strcmp(value, "upper\n") && strcmp(value, "lower\n")) { + FAIL("renamed colliding spelling has unexpected contents"); + } else if (raw_open_rdonly(untouched_path) < 0) { + FAIL("rename disturbed untouched colliding entry"); + } else if (dir_has_entry(base, "foo") != 0 || + dir_has_entry(base, "bar") != 1) { + FAIL("directory listing did not reflect sidecar rename"); + } else { + PASS(); + } + } + + TEST("renameat2 NOREPLACE preserves existing colliding destination"); + { /* bar (produced by the rename test above) must not replace the existing Foo: the call has to fail with EEXIST and leave both paths accessible */ + char src[320]; + char dst[320]; + + snprintf(src, sizeof(src), "%s/bar", base); + snprintf(dst, sizeof(dst), "%s/Foo", base); + + errno = 0; + if (syscall(SYS_renameat2, AT_FDCWD, src, AT_FDCWD, dst, + 1 /* RENAME_NOREPLACE */) != -1) { + FAIL("renameat2 NOREPLACE unexpectedly succeeded"); + } else if (errno != EEXIST) { + FAIL("renameat2 NOREPLACE returned wrong errno"); + } else if (access(src, F_OK) < 0 || access(dst, F_OK) < 0) { + FAIL("renameat2 NOREPLACE disturbed source or destination"); + } else { + PASS(); + } + } + + TEST("fallback linkat preserves AT_SYMLINK_FOLLOW semantics"); + { /* the linkat checks only run when the sidecar index exists in base; otherwise the test passes once the fixtures are created */ + char target[320]; + char link_path[320]; + char hard_path[320]; + struct stat st; + + snprintf(target, sizeof(target), "%s/real-target", base); + snprintf(link_path, sizeof(link_path), "%s/real-link", base); + snprintf(hard_path, sizeof(hard_path), "%s/REAL-HARD", base); + unlink(hard_path); + unlink(link_path); + unlink(target); + + if (create_file(target, "follow\n") < 0) { + FAIL("failed to create link target"); + } else if (symlink(target, link_path) < 0) { + FAIL("failed to create symlink"); + } else if (!sidecar_fallback_active(base)) { + PASS(); + } else if (linkat(AT_FDCWD, link_path, AT_FDCWD, hard_path, + AT_SYMLINK_FOLLOW) < 0) { + FAIL("linkat with AT_SYMLINK_FOLLOW failed"); + } else if (lstat(hard_path, &st) < 0) { + FAIL("lstat on hardlink target failed"); + } else if (!S_ISREG(st.st_mode)) { /* lstat does not follow links, so a non-regular result means the symlink itself was linked */ + FAIL("sidecar fallback
linked the symlink instead of its target"); + } else if (linkat(AT_FDCWD, target, AT_FDCWD, hard_path, 0x40000000) != + -1 || + errno != EINVAL) { /* hard_path already exists at this point; the test still requires EINVAL for the unknown flag bit */ + FAIL("sidecar fallback accepted unsupported linkat flags"); + } else { + PASS(); + } + } + + TEST("fallback rejects reserved sidecar basename for create paths"); + { /* NOTE(review): unlink(poison) runs before the activity check, so if that unlink ever removed the index the test would take the inactive-sidecar PASS branch; confirm the unlink is rejected. */ + char poison[320]; + snprintf(poison, sizeof(poison), "%s/.elfuse_case_index", base); + unlink(poison); + + if (!sidecar_fallback_active(base)) { + PASS(); + } else if (symlinkat("target", AT_FDCWD, poison) != -1 || + errno != ENOENT) { + FAIL("reserved sidecar basename was creatable"); + } else if (!sidecar_fallback_active(base)) { + FAIL("reserved-name probe disturbed sidecar metadata"); + } else { + PASS(); + } + } + + TEST("255-byte colliding basenames both open"); + { /* each basename is 255 bytes: 'a' or 'A' followed by 254 'a' characters */ + char name_a[256]; + char name_b[256]; + char path_a[512]; + char path_b[512]; + + build_long_name(name_a, sizeof(name_a), 'a'); + build_long_name(name_b, sizeof(name_b), 'A'); + snprintf(path_a, sizeof(path_a), "%s/%s", base, name_a); + snprintf(path_b, sizeof(path_b), "%s/%s", base, name_b); + unlink(path_a); + unlink(path_b); + + if (create_file(path_a, "long-a\n") < 0 || + create_file(path_b, "long-b\n") < 0) { + FAIL("failed to create long colliding names"); + } else if (raw_open_rdonly(path_a) < 0 || raw_open_rdonly(path_b) < 0) { + FAIL("failed to reopen long colliding names"); + } else { + PASS(); + } + } + + SUMMARY("test-case-collision"); + return fails > 0 ?
1 : 0; +} diff --git a/tests/test-sysroot-create-paths.c b/tests/test-sysroot-create-paths.c new file mode 100644 index 0000000..3ff1082 --- /dev/null +++ b/tests/test-sysroot-create-paths.c @@ -0,0 +1,256 @@ +/* Sysroot create-path routing regression tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" +#include "test-util.h" + +int passes = 0, fails = 0; + +static int write_file(const char *path, const char *contents) +{ /* Create or truncate path with mode 0644 and write contents; returns -1 if open fails, otherwise whatever write_fd_all returns. */ + int fd = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0644); + if (fd < 0) + return -1; + int rc = write_fd_all(fd, contents, strlen(contents)); + close(fd); + return rc; +} + +static int xattr_probe(const char *path) +{ /* Returns 1 if a user xattr can be set on path (the probe attribute is removed again), 0 if setxattr reports ENOTSUP/EOPNOTSUPP, -1 on any other failure. */ + errno = 0; + int rc = setxattr(path, "user.elfuse_probe", "x", 1, 0); + if (rc == 0) { + removexattr(path, "user.elfuse_probe"); + return 1; + } + if (errno == ENOTSUP || errno == EOPNOTSUPP) + return 0; + return -1; +} + +int main(int argc, char **argv) +{ + if (argc != 5) { /* NOTE(review): the usage text below names no operands; the four expected ones are bound to argv[1..4] right after this check. */ + fprintf(stderr, + "usage: %s " + " \n", + argv[0]); + return 2; + } + + const char *guest_tmp_path = argv[1]; + const char *mounted_host_tmp_path = argv[2]; + const char *host_fallback_path = argv[3]; + const char *mounted_sysroot_root = argv[4]; + char buf[256]; + + printf("test-sysroot-create-paths: create-path routing tests\n"); + + TEST("/tmp create is redirected into sysroot"); + { /* writes through guest_tmp_path, then requires the same contents via both guest_tmp_path and mounted_host_tmp_path */ + if (write_file(guest_tmp_path, "tmp-redir\n") < 0) { + FAIL("write via guest /tmp path failed"); + } else if (read_file_nul(guest_tmp_path, buf, sizeof(buf)) <= 0) { + FAIL("read back via guest /tmp path failed"); + } else if (strcmp(buf, "tmp-redir\n")) { + FAIL("guest /tmp file contents mismatch"); + } else if (read_file_nul(mounted_host_tmp_path, buf, sizeof(buf)) <= + 0) { + FAIL("mounted sysroot tmp path not created"); + } else if (strcmp(buf, "tmp-redir\n")) { + FAIL("mounted sysroot tmp file contents mismatch"); + } else { + PASS(); +
} + } + + TEST("non-sysroot absolute create falls back to host"); + { /* round-trips a file through host_fallback_path */ + if (write_file(host_fallback_path, "host-fallback\n") < 0) { + FAIL("host fallback create failed"); + } else if (read_file_nul(host_fallback_path, buf, sizeof(buf)) <= 0) { + FAIL("host fallback readback failed"); + } else if (strcmp(buf, "host-fallback\n")) { + FAIL("host fallback file contents mismatch"); + } else { + PASS(); + } + } + + TEST("relative xattr uses guest cwd"); + { /* changes the working directory for the rest of the program; a probe result of 0 (xattrs unsupported) counts as a pass */ + const char *rel_dir = "/tmp/elfuse-sysroot-create-paths"; + const char *rel_file = "rel-xattr.txt"; + char value[32]; + memset(value, 0, sizeof(value)); /* zero-filled so the 3-byte xattr value reads back as a NUL-terminated string */ + + if (chdir(rel_dir) < 0) { + FAIL("chdir into redirected tmp dir failed"); + } else if (write_file(rel_file, "cwd\n") < 0) { + FAIL("relative file create in guest cwd failed"); + } else { + int probe = xattr_probe(rel_file); + if (probe < 0) { + FAIL("relative xattr probe failed"); + } else if (probe == 0) { + PASS(); + } else if (setxattr(rel_file, "user.elfuse_relx", "cwd", 3, 0) < + 0) { + FAIL("relative setxattr failed"); + } else if (getxattr(rel_file, "user.elfuse_relx", value, + sizeof(value)) != 3) { + FAIL("relative getxattr failed"); + } else if (strcmp(value, "cwd")) { + FAIL("relative xattr value mismatch"); + } else { + PASS(); + } + } + } + + TEST("redirected /tmp create rejects sysroot symlink escape"); + { /* escape_dir starts as a copy of host_fallback_path and is cut back to its parent directory further down */ + char sysroot_tmp[512]; + char escape_dir[512]; + char guest_escape_path[512]; + char host_escape_file[512]; + char *slash; + + if (snprintf(sysroot_tmp, sizeof(sysroot_tmp), "%s/tmp", + mounted_sysroot_root) >= (int) sizeof(sysroot_tmp) || + snprintf(escape_dir, sizeof(escape_dir), "%s", + host_fallback_path) >= (int) sizeof(escape_dir) || + snprintf(guest_escape_path, sizeof(guest_escape_path), + "/tmp/escape-check/file.txt") >= + (int) sizeof(guest_escape_path)) { /* a return value >= the buffer size means snprintf truncated */ + FAIL("symlink escape paths too long"); + } else { + slash = strrchr(escape_dir, '/'); + if (!slash) { + FAIL("host fallback path has no parent directory"); + goto
symlink_escape_done; + } + *slash = '\0'; /* escape_dir now names the parent directory of host_fallback_path */ + if (snprintf(host_escape_file, sizeof(host_escape_file), + "%s/escape-check/file.txt", + escape_dir) >= (int) sizeof(host_escape_file)) { + FAIL("symlink escape paths too long"); + } else if (unlink("/tmp/elfuse-sysroot-create-paths/file.txt") < + 0 && + errno != ENOENT) { /* the cleanup steps in this chain treat ENOENT as success */ + FAIL("failed to remove redirected tmp test file"); + } else if (unlink( + "/tmp/elfuse-sysroot-create-paths/rel-xattr.txt") < + 0 && + errno != ENOENT) { + FAIL("failed to remove redirected tmp xattr test file"); + } else if (rmdir("/tmp/elfuse-sysroot-create-paths") < 0 && + errno != ENOENT) { + FAIL("failed to remove redirected tmp test directory"); + } else if (rmdir(sysroot_tmp) < 0) { + FAIL("failed to remove sysroot tmp dir before symlink test"); + } else if (symlink(escape_dir, sysroot_tmp) < 0) { /* replaces the sysroot tmp directory with a symlink to escape_dir */ + FAIL("failed to install sysroot tmp symlink"); + } else { + errno = 0; + if (write_file(guest_escape_path, "escape\n") == 0) { + FAIL("redirected /tmp create followed symlink escape"); + } else if (access(host_escape_file, F_OK) == 0) { + FAIL("redirected /tmp create escaped into host path"); + } else { + PASS(); + } + } + } + symlink_escape_done:; + } + + TEST("failed unlink does not create redirected parent dirs"); + { /* removes the sysroot tmp symlink left by the previous test, then requires that a failing unlink under /tmp does not bring the sysroot tmp path back into existence */ + char sysroot_tmp[512]; + char guest_missing_path[512]; + + if (snprintf(sysroot_tmp, sizeof(sysroot_tmp), "%s/tmp", + mounted_sysroot_root) >= (int) sizeof(sysroot_tmp) || + snprintf(guest_missing_path, sizeof(guest_missing_path), + "/tmp/unlink-missing/file.txt") >= + (int) sizeof(guest_missing_path)) { + FAIL("unlink side-effect paths too long"); + } else if (unlink(sysroot_tmp) < 0 && errno != ENOENT) { + FAIL("failed to remove sysroot tmp symlink before unlink test"); + } else if (unlink("/tmp/unlink-missing/file.txt") < 0 && + errno != ENOENT) { + FAIL("failed to remove previous redirected tmp file"); + } else if (rmdir("/tmp/unlink-missing") < 0 && errno != ENOENT) { + FAIL("failed to remove previous redirected tmp
directory"); + } else if (unlink(guest_missing_path) == 0) { + FAIL("unlink unexpectedly succeeded for missing redirected path"); + } else if (access(sysroot_tmp, F_OK) == 0) { + FAIL("failed unlink created redirected tmp parent"); + } else { + PASS(); + } + } + + TEST("failed unlink rejects redirected tmp symlink escape"); + { /* NOTE(review): nothing in this file creates host_escape_file, so the final access() check cannot observe a deletion through the symlink; confirm whether the fixture is meant to exist before the unlink. */ + char sysroot_tmp[512]; + char escape_dir[512]; + char guest_missing_path[512]; + char host_escape_file[512]; + char *slash; + + if (snprintf(sysroot_tmp, sizeof(sysroot_tmp), "%s/tmp", + mounted_sysroot_root) >= (int) sizeof(sysroot_tmp) || + snprintf(escape_dir, sizeof(escape_dir), "%s", + host_fallback_path) >= (int) sizeof(escape_dir) || + snprintf(guest_missing_path, sizeof(guest_missing_path), + "/tmp/unlink-escape.txt") >= + (int) sizeof(guest_missing_path)) { + FAIL("unlink symlink escape paths too long"); + } else { + slash = strrchr(escape_dir, '/'); + if (!slash) { + FAIL("host fallback path has no parent directory"); + goto unlink_symlink_escape_done; + } + *slash = '\0'; /* escape_dir now names the parent directory of host_fallback_path */ + if (snprintf(host_escape_file, sizeof(host_escape_file), + "%s/unlink-escape.txt", + escape_dir) >= (int) sizeof(host_escape_file)) { + FAIL("unlink symlink escape paths too long"); + } else if (unlink(sysroot_tmp) < 0 && errno != ENOENT) { + FAIL( + "failed to remove sysroot tmp path before unlink escape " + "test"); + } else if (symlink(escape_dir, sysroot_tmp) < 0) { /* points the sysroot tmp path at escape_dir */ + FAIL( + "failed to install sysroot tmp symlink for unlink escape " + "test"); + } else if (unlink(guest_missing_path) == 0) { + FAIL( + "unlink unexpectedly succeeded for redirected symlink " + "path"); + } else if (access(host_escape_file, F_OK) == 0) { + FAIL("unlink followed redirected tmp symlink escape"); + } else { + PASS(); + } + } + unlink_symlink_escape_done:; + } + + SUMMARY("test-sysroot-create-paths"); + return fails > 0 ? 1 : 0; +}