From ed584488597fdde18112a935770ffaabd971a272 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Mon, 10 Nov 2025 16:08:47 +0000 Subject: [PATCH 01/12] Merged PR 12878455: Fix for the Rego "metadata desync" bug [cherry-picked from 421b12249544a334e36df33dc4846673b2a88279] This set of changes fixes the [Metadata Desync with UVM State](https://msazure.visualstudio.com/One/_workitems/edit/33232631/) bug, by reverting the Rego policy state on mount and some types of unmount failures. For mounts, a minor cleanup code is added to ensure we close down the dm-crypt device if we fails to mount it. Aside from this, it is relatively straightforward - if we get a failure, the clean up functions will remove the directory, remove any dm-devices, and we will revert the Rego metadata. For unmounts, careful consideration needs to be taken, since if the directory has been unmounted successfully (or even partially successful?), and we get an error, we cannot just revert the policy state, as this may allow the host to use a broken / empty mount as one of the image layer. See 615c9a0bdf's commit message for more detailed thoughts. The solution I opted for is, for non-trivial unmount failure cases (i.e. not policy denial, not invalid mountpoint), if it fails, then we will block all further mount, unmount, container creation and deletion attempts. I think this is OK since we really do not expect unmounts to fail - this is especially the case for us since the only writable disk mount we have is the shared scratch disk, which we do not unmount at all unless we're about to kill the UVM. Testing ------- The "Rollback policy state on mount errors" commit message has some instruction for making a deliberately corrupted (but still passes the verifyinfo extraction) VHD that will cause a mount error. The other way we could make mount / unmount fail, and thus test this change, is by mounting a tmpfs or ro bind in relevant places: To make unmount fail: mkdir /run/gcs/c/.../rootfs/a && mount -t tmpfs none /run/gcs/c/.../rootfs/a or mkdir /run/gcs/mounts/scsi/m1/a && mount -t tmpfs none /run/gcs/mounts/scsi/m1/a To make mount fail: mount -o ro --bind /run/mounts/scsi /run/mounts/scsi or mount --bind -o ro /run/gcs/c /run/gcs/c Once failure is triggered, one can make them work again by `umount`ing the tmpfs or ro bind. What about other operations? ---------------------------- This PR covers mount and unmount of disks, overlays and 9p. Aside from setting `metadata.matches` as part of the narrowing scheme, and except for `metadata.started` to prevent re-using a container ID, Rego does not use persistent state for any other operations. Since it's not clear whether reverting the state would be semantically correct (we would need to carefully consider exactly what are the side effects of say, failing to execute a process, start a container, or send a signal, etc), and adding the revert to those operations does not currently affect much behaviour, I've opted not to apply the metadata revert to those for now. Allow unrecoverable_error.go to build on Windows and fix IsSNP() invocation IsSNP() now can return an error, although this is not expected on LCOW. Signed-off-by: Tingmao Wang --- internal/gcs/unrecoverable_error.go | 53 ++++ internal/guest/runtime/hcsv2/uvm.go | 252 ++++++++++++++-- internal/guest/storage/mount.go | 1 + internal/guest/storage/overlay/overlay.go | 3 +- internal/guest/storage/scsi/scsi.go | 15 +- internal/guest/storage/scsi/scsi_test.go | 6 + .../regopolicyinterpreter.go | 66 ++++- .../regopolicyinterpreter_test.go | 164 +++++++++++ pkg/securitypolicy/rego_utils_test.go | 38 ++- pkg/securitypolicy/regopolicy_linux_test.go | 273 ++++++++++++++++++ pkg/securitypolicy/securitypolicyenforcer.go | 22 ++ .../securitypolicyenforcer_rego.go | 84 ++++++ 12 files changed, 935 insertions(+), 42 deletions(-) create mode 100644 internal/gcs/unrecoverable_error.go diff --git a/internal/gcs/unrecoverable_error.go b/internal/gcs/unrecoverable_error.go new file mode 100644 index 0000000000..96528088aa --- /dev/null +++ b/internal/gcs/unrecoverable_error.go @@ -0,0 +1,53 @@ +package gcs + +import ( + "context" + "fmt" + "os" + "runtime" + "time" + + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/pkg/amdsevsnp" + "github.com/sirupsen/logrus" +) + +// UnrecoverableError logs the error and then puts the current thread into an +// infinite sleep loop. This is to be used instead of panicking, as the +// behaviour of GCS panics is unpredictable. This function can be extended to, +// for example, try to shutdown the VM cleanly. +func UnrecoverableError(err error) { + buf := make([]byte, 300*(1<<10)) + stackSize := runtime.Stack(buf, true) + stackTrace := string(buf[:stackSize]) + + errPrint := fmt.Sprintf( + "Unrecoverable error in GCS: %v\n%s", + err, stackTrace, + ) + + isSnp, err := amdsevsnp.IsSNP() + if err != nil { + // IsSNP() cannot fail on LCOW + // but if it does, we proceed as if we're on SNP to be safe. + isSnp = true + } + + if isSnp { + errPrint += "\nThis thread will now enter an infinite loop." + } + log.G(context.Background()).WithError(err).Logf( + logrus.FatalLevel, + "%s", + errPrint, + ) + + if !isSnp { + panic("Unrecoverable error in GCS: " + err.Error()) + } else { + fmt.Fprintf(os.Stderr, "%s\n", errPrint) + for { + time.Sleep(time.Hour) + } + } +} diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 43669a6f26..572126e54a 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -17,6 +17,7 @@ import ( "regexp" "strings" "sync" + "sync/atomic" "syscall" "time" @@ -116,6 +117,16 @@ type Host struct { // hostMounts keeps the state of currently mounted devices and file systems, // which is used for GCS hardening. hostMounts *hostMounts + // A permanent flag to indicate that further mounts, unmounts and container + // creation should not be allowed. This is set when, because of a failure + // during an unmount operation, we end up in a state where the policy + // enforcer's state is out of sync with what we have actually done, but we + // cannot safely revert its state. + // + // Not used in non-confidential mode. + mountsBroken atomic.Bool + // A user-friendly error message for why mountsBroken was set. + mountsBrokenCausedBy string } func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer securitypolicy.SecurityPolicyEnforcer, logWriter io.Writer) *Host { @@ -135,6 +146,7 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s devNullTransport: &transport.DevNullTransport{}, hostMounts: newHostMounts(), securityOptions: securityPolicyOptions, + mountsBroken: atomic.Bool{}, } } @@ -386,7 +398,44 @@ func checkContainerSettings(sandboxID, containerID string, settings *prot.VMHost return nil } +// Returns an error if h.mountsBroken is set (and we're in a confidential +// container host) +func (h *Host) checkMountsNotBroken() error { + if h.HasSecurityPolicy() && h.mountsBroken.Load() { + return errors.Errorf( + "Mount, unmount, container creation and deletion has been disabled in this UVM due to a previous error (%q)", + h.mountsBrokenCausedBy, + ) + } + return nil +} + +func (h *Host) setMountsBrokenIfConfidential(cause string) { + if !h.HasSecurityPolicy() { + return + } + h.mountsBroken.Store(true) + h.mountsBrokenCausedBy = cause + log.G(context.Background()).WithFields(logrus.Fields{ + "cause": cause, + }).Error("Host::mountsBroken set to true. All further mounts/unmounts, container creation and deletion will fail.") +} + +func checkExists(path string) (error, bool) { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + return nil, false + } + return errors.Wrapf(err, "failed to determine if path '%s' exists", path), false + } + return nil, true +} + func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { + if err = h.checkMountsNotBroken(); err != nil { + return nil, err + } + criType, isCRI := settings.OCISpecification.Annotations[annotations.KubernetesContainerType] // Check for virtual pod annotation @@ -738,6 +787,10 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * case guestresource.ResourceTypeSCSIDevice: return modifySCSIDevice(ctx, req.RequestType, req.Settings.(*guestresource.SCSIDevice)) case guestresource.ResourceTypeMappedVirtualDisk: + if err := h.checkMountsNotBroken(); err != nil { + return err + } + mvd := req.Settings.(*guestresource.LCOWMappedVirtualDisk) // find the actual controller number on the bus and update the incoming request. var cNum uint8 @@ -775,18 +828,30 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * }() } } - return modifyMappedVirtualDisk(ctx, req.RequestType, mvd, h.securityOptions.PolicyEnforcer) + return h.modifyMappedVirtualDisk(ctx, req.RequestType, mvd) case guestresource.ResourceTypeMappedDirectory: - return modifyMappedDirectory(ctx, h.vsock, req.RequestType, req.Settings.(*guestresource.LCOWMappedDirectory), h.securityOptions.PolicyEnforcer) + if err := h.checkMountsNotBroken(); err != nil { + return err + } + + return h.modifyMappedDirectory(ctx, h.vsock, req.RequestType, req.Settings.(*guestresource.LCOWMappedDirectory)) case guestresource.ResourceTypeVPMemDevice: - return modifyMappedVPMemDevice(ctx, req.RequestType, req.Settings.(*guestresource.LCOWMappedVPMemDevice), h.securityOptions.PolicyEnforcer) + if err := h.checkMountsNotBroken(); err != nil { + return err + } + + return h.modifyMappedVPMemDevice(ctx, req.RequestType, req.Settings.(*guestresource.LCOWMappedVPMemDevice)) case guestresource.ResourceTypeCombinedLayers: + if err := h.checkMountsNotBroken(); err != nil { + return err + } + cl := req.Settings.(*guestresource.LCOWCombinedLayers) // when cl.ScratchPath == "", we mount overlay as read-only, in which case // we don't really care about scratch encryption, since the host already // knows about the layers and the overlayfs. encryptedScratch := cl.ScratchPath != "" && h.hostMounts.IsEncrypted(cl.ScratchPath) - return modifyCombinedLayers(ctx, req.RequestType, req.Settings.(*guestresource.LCOWCombinedLayers), encryptedScratch, h.securityOptions.PolicyEnforcer) + return h.modifyCombinedLayers(ctx, req.RequestType, req.Settings.(*guestresource.LCOWCombinedLayers), encryptedScratch) case guestresource.ResourceTypeNetwork: return modifyNetwork(ctx, req.RequestType, req.Settings.(*guestresource.LCOWNetworkAdapter)) case guestresource.ResourceTypeVPCIDevice: @@ -1174,19 +1239,19 @@ func modifySCSIDevice( } } -func modifyMappedVirtualDisk( +func (h *Host) modifyMappedVirtualDisk( ctx context.Context, rt guestrequest.RequestType, mvd *guestresource.LCOWMappedVirtualDisk, - securityPolicy securitypolicy.SecurityPolicyEnforcer, ) (err error) { var verityInfo *guestresource.DeviceVerityInfo + securityPolicy := h.securityOptions.PolicyEnforcer if mvd.ReadOnly { // The only time the policy is empty, and we want it to be empty // is when no policy is provided, and we default to open door // policy. In any other case, e.g. explicit open door or any // other rego policy we would like to mount layers with verity. - if len(securityPolicy.EncodedSecurityPolicy()) > 0 { + if h.HasSecurityPolicy() { devPath, err := scsi.GetDevicePath(ctx, mvd.Controller, mvd.Lun, mvd.Partition) if err != nil { return err @@ -1200,6 +1265,17 @@ func modifyMappedVirtualDisk( } } } + + // For confidential containers, we revert the policy metadata on both mount + // and unmount errors, but if we've actually called Unmount and it fails we + // permanently block further device operations. + var rev securitypolicy.RevertableSectionHandle + rev, err = securityPolicy.StartRevertableSection() + if err != nil { + return errors.Wrapf(err, "failed to start revertable section on security policy enforcer") + } + defer h.commitOrRollbackPolicyRevSection(ctx, rev, &err) + switch rt { case guestrequest.RequestTypeAdd: mountCtx, cancel := context.WithTimeout(ctx, time.Second*5) @@ -1227,6 +1303,12 @@ func modifyMappedVirtualDisk( Filesystem: mvd.Filesystem, BlockDev: mvd.BlockDev, } + // Since we're rolling back the policy metadata (via the revertable + // section) on failure, we need to ensure that we have reverted all + // the side effects from this failed mount attempt, otherwise the + // Rego metadata is technically still inconsistent with reality. + // Mount cleans up the created directory and dm devices on failure, + // so we're good. return scsi.Mount(mountCtx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, mvd.ReadOnly, mvd.Options, config) } @@ -1234,14 +1316,33 @@ func modifyMappedVirtualDisk( case guestrequest.RequestTypeRemove: if mvd.MountPath != "" { if mvd.ReadOnly { - if err := securityPolicy.EnforceDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { + if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) } } else { - if err := securityPolicy.EnforceRWDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { + if err = securityPolicy.EnforceRWDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) } } + // Check that the directory actually exists first, and if it does + // not then we just refuse to do anything, without closing the dm + // device or setting the mountsBroken flag. Policy metadata is + // still reverted to reflect the fact that we have not done + // anything. + // + // Note: we should not do this check before calling the policy + // enforcer, as otherwise we might inadvertently allow the host to + // find out whether an arbitrary path (which may point to sensitive + // data within a container rootfs) exists or not + if h.HasSecurityPolicy() { + err, exists := checkExists(mvd.MountPath) + if err != nil { + return err + } + if !exists { + return errors.Errorf("unmounting scsi device at %s failed: directory does not exist", mvd.MountPath) + } + } config := &scsi.Config{ Encrypted: mvd.Encrypted, VerityInfo: verityInfo, @@ -1249,8 +1350,11 @@ func modifyMappedVirtualDisk( Filesystem: mvd.Filesystem, BlockDev: mvd.BlockDev, } - if err := scsi.Unmount(ctx, mvd.Controller, mvd.Lun, mvd.Partition, - mvd.MountPath, config); err != nil { + err = scsi.Unmount(ctx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, config) + if err != nil { + h.setMountsBrokenIfConfidential( + fmt.Sprintf("unmounting scsi device at %s failed: %v", mvd.MountPath, err), + ) return err } } @@ -1260,13 +1364,23 @@ func modifyMappedVirtualDisk( } } -func modifyMappedDirectory( +func (h *Host) modifyMappedDirectory( ctx context.Context, vsock transport.Transport, rt guestrequest.RequestType, md *guestresource.LCOWMappedDirectory, - securityPolicy securitypolicy.SecurityPolicyEnforcer, ) (err error) { + securityPolicy := h.securityOptions.PolicyEnforcer + // For confidential containers, we revert the policy metadata on both mount + // and unmount errors, but if we've actually called Unmount and it fails we + // permanently block further device operations. + var rev securitypolicy.RevertableSectionHandle + rev, err = securityPolicy.StartRevertableSection() + if err != nil { + return errors.Wrapf(err, "failed to start revertable section on security policy enforcer") + } + defer h.commitOrRollbackPolicyRevSection(ctx, rev, &err) + switch rt { case guestrequest.RequestTypeAdd: err = securityPolicy.EnforcePlan9MountPolicy(ctx, md.MountPath) @@ -1274,6 +1388,9 @@ func modifyMappedDirectory( return errors.Wrapf(err, "mounting plan9 device at %s denied by policy", md.MountPath) } + // Similar to the reasoning in modifyMappedVirtualDisk, since we're + // rolling back the policy metadata, plan9.Mount here must clean up + // everything if it fails, which it does do. return plan9.Mount(ctx, vsock, md.MountPath, md.ShareName, uint32(md.Port), md.ReadOnly) case guestrequest.RequestTypeRemove: err = securityPolicy.EnforcePlan9UnmountPolicy(ctx, md.MountPath) @@ -1281,20 +1398,28 @@ func modifyMappedDirectory( return errors.Wrapf(err, "unmounting plan9 device at %s denied by policy", md.MountPath) } - return storage.UnmountPath(ctx, md.MountPath, true) + // Note: storage.UnmountPath is nop if path does not exist. + err = storage.UnmountPath(ctx, md.MountPath, true) + if err != nil { + h.setMountsBrokenIfConfidential( + fmt.Sprintf("unmounting plan9 device at %s failed: %v", md.MountPath, err), + ) + return err + } + return nil default: return newInvalidRequestTypeError(rt) } } -func modifyMappedVPMemDevice(ctx context.Context, +func (h *Host) modifyMappedVPMemDevice(ctx context.Context, rt guestrequest.RequestType, vpd *guestresource.LCOWMappedVPMemDevice, - securityPolicy securitypolicy.SecurityPolicyEnforcer, ) (err error) { var verityInfo *guestresource.DeviceVerityInfo + securityPolicy := h.securityOptions.PolicyEnforcer var deviceHash string - if len(securityPolicy.EncodedSecurityPolicy()) > 0 { + if h.HasSecurityPolicy() { if vpd.MappingInfo != nil { return fmt.Errorf("multi mapping is not supported with verity") } @@ -1304,6 +1429,17 @@ func modifyMappedVPMemDevice(ctx context.Context, } deviceHash = verityInfo.RootDigest } + + // For confidential containers, we revert the policy metadata on both mount + // and unmount errors, but if we've actually called Unmount and it fails we + // permanently block further device operations. + var rev securitypolicy.RevertableSectionHandle + rev, err = securityPolicy.StartRevertableSection() + if err != nil { + return errors.Wrapf(err, "failed to start revertable section on security policy enforcer") + } + defer h.commitOrRollbackPolicyRevSection(ctx, rev, &err) + switch rt { case guestrequest.RequestTypeAdd: err = securityPolicy.EnforceDeviceMountPolicy(ctx, vpd.MountPath, deviceHash) @@ -1311,13 +1447,39 @@ func modifyMappedVPMemDevice(ctx context.Context, return errors.Wrapf(err, "mounting pmem device %d onto %s denied by policy", vpd.DeviceNumber, vpd.MountPath) } + // Similar to the reasoning in modifyMappedVirtualDisk, since we're + // rolling back the policy metadata, pmem.Mount here must clean up + // everything if it fails, which it does do. return pmem.Mount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) case guestrequest.RequestTypeRemove: - if err := securityPolicy.EnforceDeviceUnmountPolicy(ctx, vpd.MountPath); err != nil { + if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, vpd.MountPath); err != nil { return errors.Wrapf(err, "unmounting pmem device from %s denied by policy", vpd.MountPath) } - return pmem.Unmount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) + // Check that the directory actually exists first, and if it does not + // then we just refuse to do anything, without closing the dm-linear or + // dm-verity device or setting the mountsBroken flag. + // + // Similar to the reasoning in modifyMappedVirtualDisk, we should not do + // this check before calling the policy enforcer. + if h.HasSecurityPolicy() { + err, exists := checkExists(vpd.MountPath) + if err != nil { + return err + } + if !exists { + return errors.Errorf("unmounting pmem device at %s failed: directory does not exist", vpd.MountPath) + } + } + + err = pmem.Unmount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) + if err != nil { + h.setMountsBrokenIfConfidential( + fmt.Sprintf("unmounting pmem device at %s failed: %v", vpd.MountPath, err), + ) + return err + } + return nil default: return newInvalidRequestTypeError(rt) } @@ -1332,19 +1494,28 @@ func modifyMappedVPCIDevice(ctx context.Context, rt guestrequest.RequestType, vp } } -func modifyCombinedLayers( +func (h *Host) modifyCombinedLayers( ctx context.Context, rt guestrequest.RequestType, cl *guestresource.LCOWCombinedLayers, scratchEncrypted bool, - securityPolicy securitypolicy.SecurityPolicyEnforcer, ) (err error) { - isConfidential := len(securityPolicy.EncodedSecurityPolicy()) > 0 + securityPolicy := h.securityOptions.PolicyEnforcer containerID := cl.ContainerID + // For confidential containers, we revert the policy metadata on both mount + // and unmount errors, but if we've actually called Unmount and it fails we + // permanently block further device operations. + var rev securitypolicy.RevertableSectionHandle + rev, err = securityPolicy.StartRevertableSection() + if err != nil { + return errors.Wrapf(err, "failed to start revertable section on security policy enforcer") + } + defer h.commitOrRollbackPolicyRevSection(ctx, rev, &err) + switch rt { case guestrequest.RequestTypeAdd: - if isConfidential { + if h.HasSecurityPolicy() { if err := checkValidContainerID(containerID, "container"); err != nil { return err } @@ -1399,15 +1570,27 @@ func modifyCombinedLayers( return fmt.Errorf("overlay creation denied by policy: %w", err) } + // Correctness for policy revertable section: + // MountLayer does two things - mkdir, then mount. On mount failure, the + // target directory is cleaned up. Therefore we're clean in terms of + // side effects. return overlay.MountLayer(ctx, layerPaths, upperdirPath, workdirPath, cl.ContainerRootPath, readonly) case guestrequest.RequestTypeRemove: // cl.ContainerID is not set on remove requests, but rego checks that we can // only umount previously mounted targets anyway - if err := securityPolicy.EnforceOverlayUnmountPolicy(ctx, cl.ContainerRootPath); err != nil { + if err = securityPolicy.EnforceOverlayUnmountPolicy(ctx, cl.ContainerRootPath); err != nil { return errors.Wrap(err, "overlay removal denied by policy") } - return storage.UnmountPath(ctx, cl.ContainerRootPath, true) + // Note: storage.UnmountPath is a no-op if the path does not exist. + err = storage.UnmountPath(ctx, cl.ContainerRootPath, true) + if err != nil { + h.setMountsBrokenIfConfidential( + fmt.Sprintf("unmounting overlay at %s failed: %v", cl.ContainerRootPath, err), + ) + return err + } + return nil default: return newInvalidRequestTypeError(rt) } @@ -1535,3 +1718,22 @@ func (h *Host) createContainerInPod(sandboxID string, containerID string) error return nil } + +// If *err is not nil, the section is rolled back, otherwise it is committed. +func (h *Host) commitOrRollbackPolicyRevSection( + ctx context.Context, + rev securitypolicy.RevertableSectionHandle, + err *error, +) { + if !h.HasSecurityPolicy() { + // Don't produce bogus log entries if we aren't in confidential mode, + // even though rev.Rollback would have been no-op. + return + } + if *err != nil { + rev.Rollback() + logrus.WithContext(ctx).WithError(*err).Warn("rolling back security policy revertable section due to error") + } else { + rev.Commit() + } +} diff --git a/internal/guest/storage/mount.go b/internal/guest/storage/mount.go index 15664dd693..9b7b946b88 100644 --- a/internal/guest/storage/mount.go +++ b/internal/guest/storage/mount.go @@ -127,6 +127,7 @@ func UnmountPath(ctx context.Context, target string, removeTarget bool) (err err if _, err := osStat(target); err != nil { if os.IsNotExist(err) { + log.G(ctx).WithField("target", target).Warnf("UnmountPath called for non-existent path") return nil } return errors.Wrapf(err, "failed to determine if path '%s' exists", target) diff --git a/internal/guest/storage/overlay/overlay.go b/internal/guest/storage/overlay/overlay.go index aa4877508f..84bf8fa529 100644 --- a/internal/guest/storage/overlay/overlay.go +++ b/internal/guest/storage/overlay/overlay.go @@ -56,8 +56,7 @@ func processErrNoSpace(ctx context.Context, path string, err error) { }).WithError(err).Warn("got ENOSPC, gathering diagnostics") } -// MountLayer first enforces the security policy for the container's layer paths -// and then calls Mount to mount the layer paths as an overlayfs. +// MountLayer calls Mount to mount the layer paths as an overlayfs. func MountLayer( ctx context.Context, layerPaths []string, diff --git a/internal/guest/storage/scsi/scsi.go b/internal/guest/storage/scsi/scsi.go index ec62636590..83c586c3eb 100644 --- a/internal/guest/storage/scsi/scsi.go +++ b/internal/guest/storage/scsi/scsi.go @@ -121,8 +121,9 @@ type Config struct { // Mount creates a mount from the SCSI device on `controller` index `lun` to // `target` // -// `target` will be created. On mount failure the created `target` will be -// automatically cleaned up. +// `target` will be created. On mount failure the created `target`, as well as +// any associated dm-crypt or dm-verify devices will be automatically cleaned +// up. // // If the config has `encrypted` is set to true, the SCSI device will be // encrypted using dm-crypt. @@ -200,7 +201,8 @@ func Mount( var deviceFS string if config.Encrypted { cryptDeviceName := fmt.Sprintf(cryptDeviceFmt, controller, lun, partition) - encryptedSource, err := encryptDevice(spnCtx, source, cryptDeviceName) + var encryptedSource string + encryptedSource, err = encryptDevice(spnCtx, source, cryptDeviceName) if err != nil { // todo (maksiman): add better retry logic, similar to how SCSI device mounts are // retried on unix.ENOENT and unix.ENXIO. The retry should probably be on an @@ -211,6 +213,13 @@ func Mount( } } source = encryptedSource + defer func() { + if err != nil { + if err := cleanupCryptDevice(spnCtx, cryptDeviceName); err != nil { + log.G(spnCtx).WithError(err).WithField("cryptDeviceName", cryptDeviceName).Debug("failed to cleanup dm-crypt device after mount failure") + } + } + }() } else { // Get the filesystem that is already on the device (if any) and use that // as the mountType unless `Filesystem` was given. diff --git a/internal/guest/storage/scsi/scsi_test.go b/internal/guest/storage/scsi/scsi_test.go index ebfcf8e382..94992047bd 100644 --- a/internal/guest/storage/scsi/scsi_test.go +++ b/internal/guest/storage/scsi/scsi_test.go @@ -999,6 +999,12 @@ func Test_Mount_EncryptDevice_Mkfs_Error(t *testing.T) { } return expectedDevicePath, nil } + cleanupCryptDevice = func(_ context.Context, dmCryptName string) error { + if dmCryptName != expectedCryptTarget { + t.Fatalf("expected cleanupCryptDevice name %q got %q", expectedCryptTarget, dmCryptName) + } + return nil + } osStat = osStatNoop xfsFormat = func(arg string) error { diff --git a/internal/regopolicyinterpreter/regopolicyinterpreter.go b/internal/regopolicyinterpreter/regopolicyinterpreter.go index 66f62c5114..6e316f9b41 100644 --- a/internal/regopolicyinterpreter/regopolicyinterpreter.go +++ b/internal/regopolicyinterpreter/regopolicyinterpreter.go @@ -63,6 +63,9 @@ type RegoModule struct { type regoMetadata map[string]map[string]interface{} +const metadataRootKey = "metadata" +const metadataOperationsKey = "metadata" + type regoMetadataAction string const ( @@ -81,6 +84,11 @@ type regoMetadataOperation struct { // The result from a policy query type RegoQueryResult map[string]interface{} +// An immutable, saved copy of the metadata state. +type SavedMetadata struct { + metadataRoot regoMetadata +} + // deep copy for an object func copyObject(data map[string]interface{}) (map[string]interface{}, error) { objJSON, err := json.Marshal(data) @@ -113,6 +121,24 @@ func copyValue(value interface{}) (interface{}, error) { return valueCopy, nil } +// deep copy for regoMetadata. +// We cannot use copyObject for this due to the fact that map[string]interface{} +// is a concrete type and a map of it cannot be used as a map of interface{}. +func copyRegoMetadata(value regoMetadata) (regoMetadata, error) { + valueJSON, err := json.Marshal(value) + if err != nil { + return nil, err + } + + var valueCopy regoMetadata + err = json.Unmarshal(valueJSON, &valueCopy) + if err != nil { + return nil, err + } + + return valueCopy, nil +} + // NewRegoPolicyInterpreter creates a new RegoPolicyInterpreter, using the code provided. // inputData is the Rego data which should be used as the initial state // of the interpreter. A deep copy is performed on it such that it will @@ -123,8 +149,8 @@ func NewRegoPolicyInterpreter(code string, inputData map[string]interface{}) (*R return nil, fmt.Errorf("unable to copy the input data: %w", err) } - if _, ok := data["metadata"]; !ok { - data["metadata"] = make(regoMetadata) + if _, ok := data[metadataRootKey]; !ok { + data[metadataRootKey] = make(regoMetadata) } policy := &RegoPolicyInterpreter{ @@ -207,7 +233,7 @@ func (r *RegoPolicyInterpreter) GetMetadata(name string, key string) (interface{ r.dataAndModulesMutex.Lock() defer r.dataAndModulesMutex.Unlock() - metadataRoot, ok := r.data["metadata"].(regoMetadata) + metadataRoot, ok := r.data[metadataRootKey].(regoMetadata) if !ok { return nil, errors.New("illegal interpreter state: invalid metadata object type") } @@ -228,6 +254,32 @@ func (r *RegoPolicyInterpreter) GetMetadata(name string, key string) (interface{ } } +// Saves a copy of the internal policy metadata state. +func (r *RegoPolicyInterpreter) SaveMetadata() (s SavedMetadata, err error) { + r.dataAndModulesMutex.Lock() + defer r.dataAndModulesMutex.Unlock() + + metadataRoot, ok := r.data[metadataRootKey].(regoMetadata) + if !ok { + return SavedMetadata{}, errors.New("illegal interpreter state: invalid metadata object type") + } + s.metadataRoot, err = copyRegoMetadata(metadataRoot) + return s, err +} + +// Restores a previously saved metadata state. +func (r *RegoPolicyInterpreter) RestoreMetadata(m SavedMetadata) error { + r.dataAndModulesMutex.Lock() + defer r.dataAndModulesMutex.Unlock() + + copied, err := copyRegoMetadata(m.metadataRoot) + if err != nil { + return fmt.Errorf("unable to copy metadata: %w", err) + } + r.data[metadataRootKey] = copied + return nil +} + func newRegoMetadataOperation(operation interface{}) (*regoMetadataOperation, error) { var metadataOp regoMetadataOperation @@ -286,7 +338,7 @@ func (r *RegoPolicyInterpreter) UpdateOSType(os string) error { func (r *RegoPolicyInterpreter) updateMetadata(ops []*regoMetadataOperation) error { // dataAndModulesMutex must be held before calling this - metadataRoot, ok := r.data["metadata"].(regoMetadata) + metadataRoot, ok := r.data[metadataRootKey].(regoMetadata) if !ok { return errors.New("illegal interpreter state: invalid metadata object type") } @@ -431,7 +483,7 @@ func (r *RegoPolicyInterpreter) logMetadata() { return } - contents, err := json.Marshal(r.data["metadata"]) + contents, err := json.Marshal(r.data[metadataRootKey]) if err != nil { r.metadataLogger.Printf("error marshaling metadata: %v\n", err.Error()) } else { @@ -637,7 +689,7 @@ func (r *RegoPolicyInterpreter) Query(rule string, input map[string]interface{}) r.logResult(rule, resultSet) ops := []*regoMetadataOperation{} - if rawMetadata, ok := resultSet["metadata"]; ok { + if rawMetadata, ok := resultSet[metadataOperationsKey]; ok { metadata, ok := rawMetadata.([]interface{}) if !ok { return nil, errors.New("error loading metadata array: invalid type") @@ -660,7 +712,7 @@ func (r *RegoPolicyInterpreter) Query(rule string, input map[string]interface{}) } for name, value := range resultSet { - if name == "metadata" { + if name == metadataOperationsKey { continue } else { result[name] = value diff --git a/internal/regopolicyinterpreter/regopolicyinterpreter_test.go b/internal/regopolicyinterpreter/regopolicyinterpreter_test.go index b7d86609f7..3872afff51 100644 --- a/internal/regopolicyinterpreter/regopolicyinterpreter_test.go +++ b/internal/regopolicyinterpreter/regopolicyinterpreter_test.go @@ -72,6 +72,37 @@ func Test_copyValue(t *testing.T) { } } +func Test_copyRegoMetadata(t *testing.T) { + f := func(orig testRegoMetadata) bool { + copy, err := copyRegoMetadata(regoMetadata(orig)) + if err != nil { + t.Error(err) + return false + } + + if len(orig) != len(copy) { + t.Errorf("original and copy have different number of objects: %d != %d", len(orig), len(copy)) + return false + } + + for name, origObject := range orig { + if copyObject, ok := copy[name]; ok { + if !assertObjectsEqual(origObject, copyObject) { + t.Errorf("original and copy differ on key %s", name) + } + } else { + t.Errorf("copy missing object %s", name) + } + } + + return true + } + + if err := quick.Check(f, &quick.Config{MaxCount: 30, Rand: testRand}); err != nil { + t.Errorf("Test_copyRegoMetadata: %v", err) + } +} + //go:embed test.rego var testCode string @@ -364,6 +395,107 @@ func Test_Metadata_Remove(t *testing.T) { } } +func Test_Metadata_SaveRestore(t *testing.T) { + rego, err := setupRego() + if err != nil { + t.Fatal(err) + } + + f := func(pairs1before, pairs1after intPairArray, name1 metadataName, pairs2before, pairs2after intPairArray, name2 metadataName) bool { + if name1 == name2 { + t.Fatalf("generated two identical names: %s", name1) + } + + err := appendAll(rego, pairs1before, name1) + if err != nil { + t.Errorf("error appending pairs1before: %v", err) + return false + } + err = appendAll(rego, pairs2before, name2) + if err != nil { + t.Errorf("error appending pairs2before: %v", err) + return false + } + + saved, err := rego.SaveMetadata() + if err != nil { + t.Errorf("unable to save metadata: %v", err) + return false + } + + beforeSum1 := getExpectedGapFromPairs(pairs1before) + err = computeGap(rego, name1, beforeSum1) + if err != nil { + t.Error(err) + return false + } + + beforeSum2 := getExpectedGapFromPairs(pairs2before) + err = computeGap(rego, name2, beforeSum2) + if err != nil { + t.Error(err) + return false + } + + // computeGap would have cleared the list, so we restore it. + err = rego.RestoreMetadata(saved) + if err != nil { + t.Errorf("unable to restore metadata: %v", err) + return false + } + + err = appendAll(rego, pairs1after, name1) + if err != nil { + t.Errorf("error appending pairs1after: %v", err) + return false + } + + err = appendAll(rego, pairs2after, name2) + if err != nil { + t.Errorf("error appending pairs2after: %v", err) + return false + } + + afterSum1 := beforeSum1 + getExpectedGapFromPairs(pairs1after) + err = computeGap(rego, name1, afterSum1) + if err != nil { + t.Error(err) + return false + } + + afterSum2 := beforeSum2 + getExpectedGapFromPairs(pairs2after) + err = computeGap(rego, name2, afterSum2) + if err != nil { + t.Error(err) + return false + } + + err = rego.RestoreMetadata(saved) + if err != nil { + t.Errorf("unable to restore metadata: %v", err) + return false + } + + err = computeGap(rego, name1, beforeSum1) + if err != nil { + t.Errorf("computeGap failed for name1 after restore: %v", err) + return false + } + + err = computeGap(rego, name2, beforeSum2) + if err != nil { + t.Errorf("computeGap failed for name2 after restore: %v", err) + return false + } + + return true + } + + if err := quick.Check(f, &quick.Config{MaxCount: 100, Rand: testRand}); err != nil { + t.Errorf("Test_Metadata_SaveRestore: %v", err) + } +} + //go:embed module.rego var moduleCode string @@ -508,6 +640,7 @@ type testValue struct { } type testArray []interface{} type testObject map[string]interface{} +type testRegoMetadata regoMetadata type testValueType int @@ -580,6 +713,16 @@ func (testObject) Generate(r *rand.Rand, _ int) reflect.Value { return reflect.ValueOf(value) } +func (testRegoMetadata) Generate(r *rand.Rand, _ int) reflect.Value { + numObjects := r.Intn(maxNumberOfFields) + metadata := make(testRegoMetadata) + for i := 0; i < numObjects; i++ { + name := uniqueString(r) + metadata[name] = generateObject(r, 0) + } + return reflect.ValueOf(metadata) +} + func getResult(r *RegoPolicyInterpreter, p intPair, rule string) (RegoQueryResult, error) { input := map[string]interface{}{"a": p.a, "b": p.b} result, err := r.Query("data.test."+rule, input) @@ -640,6 +783,27 @@ func appendLists(r *RegoPolicyInterpreter, p intPair, name metadataName) error { return nil } +func appendAll(r *RegoPolicyInterpreter, pairs intPairArray, name metadataName) error { + for _, pair := range pairs { + if err := appendLists(r, pair, name); err != nil { + return fmt.Errorf("error appending pair %v: %w", pair, err) + } + } + return nil +} + +func getExpectedGapFromPairs(pairs intPairArray) int { + expected := 0 + for _, pair := range pairs { + if pair.a >= pair.b { + expected += pair.a - pair.b + } else { + expected += pair.b - pair.a + } + } + return expected +} + func computeGap(r *RegoPolicyInterpreter, name metadataName, expected int) error { input := map[string]interface{}{"name": string(name)} result, err := r.Query("data.test.compute_gap", input) diff --git a/pkg/securitypolicy/rego_utils_test.go b/pkg/securitypolicy/rego_utils_test.go index 2967b5a6b7..bce0232f52 100644 --- a/pkg/securitypolicy/rego_utils_test.go +++ b/pkg/securitypolicy/rego_utils_test.go @@ -347,18 +347,25 @@ type regoPlan9MountTestConfig struct { } func mountImageForContainer(policy *regoEnforcer, container *securityPolicyContainer) (string, error) { - ctx := context.Background() containerID := testDataGenerator.uniqueContainerID() + if err := mountImageForContainerWithID(policy, container, containerID); err != nil { + return "", err + } + return containerID, nil +} + +func mountImageForContainerWithID(policy *regoEnforcer, container *securityPolicyContainer, containerID string) error { + ctx := context.Background() layerPaths, err := testDataGenerator.createValidOverlayForContainer(policy, container) if err != nil { - return "", fmt.Errorf("error creating valid overlay: %w", err) + return fmt.Errorf("error creating valid overlay: %w", err) } scratchDisk := getScratchDiskMountTarget(containerID) err = policy.EnforceRWDeviceMountPolicy(ctx, scratchDisk, true, true, "xfs") if err != nil { - return "", fmt.Errorf("error mounting scratch disk: %w", err) + return fmt.Errorf("error mounting scratch disk: %w", err) } overlayTarget := getOverlayMountTarget(containerID) @@ -367,12 +374,13 @@ func mountImageForContainer(policy *regoEnforcer, container *securityPolicyConta err = policy.EnforceOverlayMountPolicy( ctx, containerID, copyStrings(layerPaths), overlayTarget) if err != nil { - return "", fmt.Errorf("error mounting filesystem: %w", err) + return fmt.Errorf("error mounting filesystem: %w", err) } - return containerID, nil + return nil } + func buildMountSpecFromMountArray(mounts []mountInternal, sandboxID string, r *rand.Rand) *oci.Spec { mountSpec := new(oci.Spec) @@ -1404,6 +1412,10 @@ func setupRegoCreateContainerTest(gc *generatedConstraints, testContainer *secur return nil, err } + return createTestContainerSpec(gc, containerID, testContainer, privilegedError, policy, defaultMounts, privilegedMounts) +} + +func createTestContainerSpec(gc *generatedConstraints, containerID string, testContainer *securityPolicyContainer, privilegedError bool, policy *regoEnforcer, defaultMounts, privilegedMounts []mountInternal) (*regoContainerTestConfig, error) { envList := buildEnvironmentVariablesFromEnvRules(testContainer.EnvRules, testRand) sandboxID := testDataGenerator.uniqueSandboxID() @@ -2999,3 +3011,19 @@ type containerInitProcess struct { WorkingDir string AllowStdioAccess bool } + +func startRevertableSection(t *testing.T, policy *regoEnforcer) RevertableSectionHandle { + rev, err := policy.StartRevertableSection() + if err != nil { + t.Fatalf("Failed to start revertable section: %v", err) + } + return rev +} + +func commitOrRollback(rev RevertableSectionHandle, shouldCommit bool) { + if shouldCommit { + rev.Commit() + } else { + rev.Rollback() + } +} diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 8dd409fccf..0ddd933bac 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -963,6 +963,90 @@ func Test_Rego_EnforceOverlayMountPolicy_Multiple_Instances_Same_Container(t *te } } +func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { + f := func(gc *generatedConstraints, commitOnEnforcementFailure bool) bool { + securityPolicy := gc.toPolicy() + policy, err := newRegoPolicy(securityPolicy.marshalRego(), []oci.Mount{}, []oci.Mount{}, testOSType) + if err != nil { + t.Errorf("cannot make rego policy from constraints: %v", err) + return false + } + tc := selectContainerFromContainerList(gc.containers, testRand) + tid := testDataGenerator.uniqueContainerID() + scratchTarget := getScratchDiskMountTarget(tid) + + rev := startRevertableSection(t, policy) + err = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchTarget, true, true, "xfs") + if err != nil { + t.Errorf("failed to EnforceRWDeviceMountPolicy: %v", err) + return false + } + rev.Commit() + + layerToErr := testRand.Intn(len(tc.Layers)) + errLayerPathIndex := len(tc.Layers) - layerToErr - 1 + layerPaths := make([]string, len(tc.Layers)) + for i, layerHash := range tc.Layers { + rev := startRevertableSection(t, policy) + target := testDataGenerator.uniqueLayerMountTarget() + layerPaths[len(tc.Layers)-i-1] = target + err = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) + if err != nil { + t.Errorf("failed to EnforceDeviceMountPolicy: %v", err) + return false + } + if i == layerToErr { + // Simulate a mount failure at this point, which will cause us to rollback + rev.Rollback() + } else { + rev.Commit() + } + } + + rev = startRevertableSection(t, policy) + overlayTarget := getOverlayMountTarget(tid) + err = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPaths, overlayTarget) + if !assertDecisionJSONContains(t, err, append(slices.Clone(layerPaths), "no matching containers for overlay")...) { + return false + } + commitOrRollback(rev, commitOnEnforcementFailure) + + layerPathsWithoutErr := make([]string, 0) + for i, layerPath := range layerPaths { + if i != errLayerPathIndex { + layerPathsWithoutErr = append(layerPathsWithoutErr, layerPath) + } + } + + rev = startRevertableSection(t, policy) + err = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPathsWithoutErr, overlayTarget) + if !assertDecisionJSONContains(t, err, append(slices.Clone(layerPathsWithoutErr), "no matching containers for overlay")...) { + return false + } + commitOrRollback(rev, commitOnEnforcementFailure) + + retryTarget := layerPaths[errLayerPathIndex] + rev = startRevertableSection(t, policy) + err = policy.EnforceDeviceMountPolicy(gc.ctx, retryTarget, tc.Layers[layerToErr]) + if err != nil { + t.Errorf("failed to EnforceDeviceMountPolicy again after one previous reverted failure: %v", err) + return false + } + rev.Commit() + err = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPaths, overlayTarget) + if err != nil { + t.Errorf("failed to EnforceOverlayMountPolicy after one previous reverted failure: %v", err) + return false + } + + return true + } + + if err := quick.Check(f, &quick.Config{MaxCount: 50, Rand: testRand}); err != nil { + t.Errorf("Test_Rego_EnforceOverlayMountPolicy_MountFail: %v", err) + } +} + func Test_Rego_EnforceOverlayUnmountPolicy(t *testing.T) { f := func(p *generatedConstraints) bool { tc, err := setupRegoOverlayTest(p, true) @@ -6103,6 +6187,195 @@ func Test_Rego_Enforce_CreateContainer_RequiredEnvMissingHasErrorMessage(t *test } } +func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { + f := func(gc *generatedConstraints, commitOnEnforcementFailure bool) bool { + container := selectContainerFromContainerList(gc.containers, testRand) + securityPolicy := gc.toPolicy() + defaultMounts := generateMounts(testRand) + privilegedMounts := generateMounts(testRand) + + policy, err := newRegoPolicy(securityPolicy.marshalRego(), + toOCIMounts(defaultMounts), + toOCIMounts(privilegedMounts), testOSType) + if err != nil { + t.Errorf("cannot make rego policy from constraints: %v", err) + return false + } + + containerID := testDataGenerator.uniqueContainerID() + tc, err := createTestContainerSpec(gc, containerID, container, false, policy, defaultMounts, privilegedMounts) + if err != nil { + t.Fatal(err) + } + + layers, err := testDataGenerator.createValidOverlayForContainer(policy, container) + if err != nil { + t.Errorf("Failed to createValidOverlayForContainer: %v", err) + return false + } + + scratchMountTarget := getScratchDiskMountTarget(containerID) + rev := startRevertableSection(t, policy) + err = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") + if err != nil { + t.Errorf("Failed to EnforceRWDeviceMountPolicy: %v", err) + return false + } + rev.Commit() + + rev = startRevertableSection(t, policy) + overlayTarget := getOverlayMountTarget(containerID) + err = policy.EnforceOverlayMountPolicy(gc.ctx, containerID, layers, overlayTarget) + if err != nil { + t.Errorf("Failed to EnforceOverlayMountPolicy: %v", err) + return false + } + // Simulate a failure by rolling back the overlay mount + rev.Rollback() + + rev = startRevertableSection(t, policy) + _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) + if err == nil { + t.Errorf("EnforceCreateContainerPolicy should have failed due to missing (reverted) overlay mount") + return false + } + commitOrRollback(rev, commitOnEnforcementFailure) + + // "Retry" overlay mount + rev = startRevertableSection(t, policy) + err = policy.EnforceOverlayMountPolicy(gc.ctx, tc.containerID, layers, overlayTarget) + if err != nil { + t.Errorf("Failed to EnforceOverlayMountPolicy: %v", err) + return false + } + rev.Commit() + + rev = startRevertableSection(t, policy) + _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) + if err != nil { + t.Errorf("Failed to EnforceCreateContainerPolicy after retrying overlay mount: %v", err) + return false + } + rev.Commit() + + return true + } + + if err := quick.Check(f, &quick.Config{MaxCount: 50, Rand: testRand}); err != nil { + t.Errorf("Test_Rego_EnforceCreateContainerPolicy_RejectRevertedOverlayMount: %v", err) + } +} + +func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { + f := func(gc *generatedConstraints, + newContainerID, failScratchMount, testDenyInvalidContainerCreation bool, + ) bool { + container := selectContainerFromContainerList(gc.containers, testRand) + securityPolicy := gc.toPolicy() + defaultMounts := generateMounts(testRand) + privilegedMounts := generateMounts(testRand) + + policy, err := newRegoPolicy(securityPolicy.marshalRego(), + toOCIMounts(defaultMounts), + toOCIMounts(privilegedMounts), testOSType) + if err != nil { + t.Errorf("cannot make rego policy from constraints: %v", err) + return false + } + + containerID := testDataGenerator.uniqueContainerID() + tc, err := createTestContainerSpec(gc, containerID, container, false, policy, defaultMounts, privilegedMounts) + if err != nil { + t.Fatal(err) + } + + scratchMountTarget := getScratchDiskMountTarget(containerID) + rev := startRevertableSection(t, policy) + err = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") + if err != nil { + t.Errorf("Failed to EnforceRWDeviceMountPolicy: %v", err) + return false + } + + succeedLayerPaths := make([]string, 0) + + if failScratchMount { + rev.Rollback() + } else { + rev.Commit() + + // Simulate one of the layers failing to mount, after which the outside + // gives up on this container and starts over. + layerToErr := testRand.Intn(len(container.Layers)) + for i, layerHash := range container.Layers { + rev := startRevertableSection(t, policy) + target := testDataGenerator.uniqueLayerMountTarget() + err = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) + if err != nil { + t.Errorf("failed to EnforceDeviceMountPolicy: %v", err) + return false + } + if i == layerToErr { + // Simulate a mount failure at this point, which will cause us to rollback + rev.Rollback() + break + } else { + rev.Commit() + succeedLayerPaths = append(succeedLayerPaths, target) + } + } + + for _, layerPath := range succeedLayerPaths { + rev := startRevertableSection(t, policy) + err = policy.EnforceDeviceUnmountPolicy(gc.ctx, layerPath) + if err != nil { + t.Errorf("Failed to EnforceDeviceUnmountPolicy: %v", err) + return false + } + rev.Commit() + } + + rev = startRevertableSection(t, policy) + err = policy.EnforceRWDeviceUnmountPolicy(gc.ctx, scratchMountTarget) + if err != nil { + t.Errorf("Failed to EnforceRWDeviceUnmountPolicy: %v", err) + return false + } + rev.Commit() + } + + if testDenyInvalidContainerCreation { + rev = startRevertableSection(t, policy) + _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) + if err == nil { + t.Errorf("EnforceCreateContainerPolicy should have failed due to missing (reverted) overlay mount") + } + rev.Rollback() + } + + if newContainerID { + tc.containerID = testDataGenerator.uniqueContainerID() + } + + err = mountImageForContainerWithID(policy, container, tc.containerID) + if err != nil { + t.Errorf("Failed to mount image for container after reverting and retrying: %v", err) + return false + } + _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) + if err != nil { + t.Errorf("Failed to EnforceCreateContainerPolicy after retrying: %v", err) + return false + } + + return true + } + + if err := quick.Check(f, &quick.Config{MaxCount: 50, Rand: testRand}); err != nil { + t.Errorf("Test_Rego_EnforceCreateContainerPolicy_RejectRevertedOverlayMount: %v", err) + } +} + func Test_Rego_ExecInContainerPolicy_RequiredEnvMissingHasErrorMessage(t *testing.T) { constraints := generateConstraints(testRand, 1) container := selectContainerFromContainerList(constraints.containers, testRand) diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index 2a4edefce1..8a4457ce22 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -57,6 +57,14 @@ func init() { registeredEnforcers[openDoorEnforcerName] = createOpenDoorEnforcer } +// Represents an in-progress revertable section. To ensure state is consistent, +// Commit() and Rollback() must not fail, so they do not return anything, and if +// an error does occur they should panic. +type RevertableSectionHandle interface { + Commit() + Rollback() +} + type SecurityPolicyEnforcer interface { EnforceDeviceMountPolicy(ctx context.Context, target string, deviceHash string) (err error) EnforceRWDeviceMountPolicy(ctx context.Context, target string, encrypted, ensureFilesystem bool, filesystem string) (err error) @@ -128,6 +136,7 @@ type SecurityPolicyEnforcer interface { GetUserInfo(spec *oci.Process, rootPath string) (IDName, []IDName, string, error) EnforceVerifiedCIMsPolicy(ctx context.Context, containerID string, layerHashes []string, mountedCim []string) (err error) EnforceRegistryChangesPolicy(ctx context.Context, containerID string, registryValues interface{}) error + StartRevertableSection() (RevertableSectionHandle, error) } //nolint:unused @@ -182,6 +191,11 @@ func CreateSecurityPolicyEnforcer( } } +type nopRevertableSectionHandle struct{} + +func (nopRevertableSectionHandle) Commit() {} +func (nopRevertableSectionHandle) Rollback() {} + type OpenDoorSecurityPolicyEnforcer struct { encodedSecurityPolicy string } @@ -324,6 +338,10 @@ func (OpenDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context.C return nil } +func (*OpenDoorSecurityPolicyEnforcer) StartRevertableSection() (RevertableSectionHandle, error) { + return nopRevertableSectionHandle{}, nil +} + type ClosedDoorSecurityPolicyEnforcer struct{} var _ SecurityPolicyEnforcer = (*ClosedDoorSecurityPolicyEnforcer)(nil) @@ -452,3 +470,7 @@ func (ClosedDoorSecurityPolicyEnforcer) EnforceVerifiedCIMsPolicy(ctx context.Co func (ClosedDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context.Context, containerID string, registryValues interface{}) error { return errors.New("registry changes are denied by policy") } + +func (*ClosedDoorSecurityPolicyEnforcer) StartRevertableSection() (RevertableSectionHandle, error) { + return nopRevertableSectionHandle{}, nil +} diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index 96c5613dd6..1997ff48f9 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -12,8 +12,10 @@ import ( "regexp" "slices" "strings" + "sync" "syscall" + "github.com/Microsoft/hcsshim/internal/gcs" "github.com/Microsoft/hcsshim/internal/guestpath" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" @@ -58,6 +60,10 @@ type regoEnforcer struct { maxErrorMessageLength int // OS type osType string + // Mutex to ensure only one revertable section is active + revertableSectionLock sync.Mutex + // Saved metadata for the revertable section + savedMetadata rpi.SavedMetadata } var _ SecurityPolicyEnforcer = (*regoEnforcer)(nil) @@ -1191,3 +1197,81 @@ func (policy *regoEnforcer) EnforceRegistryChangesPolicy(ctx context.Context, co func (policy *regoEnforcer) GetUserInfo(process *oci.Process, rootPath string) (IDName, []IDName, string, error) { return GetAllUserInfo(process, rootPath) } + +type revertableSectionHandle struct { + // policy is cleared once this struct is "used", to prevent accidental + // duplicate Commit/Rollback calls. + policy *regoEnforcer +} + +func (policy *regoEnforcer) inRevertableSection() bool { + succ := policy.revertableSectionLock.TryLock() + if succ { + // since nobody else has the lock, we're not in fact in a revertable + // section. + policy.revertableSectionLock.Unlock() + return false + } + // somebody else (i.e. the caller) has the lock, so we're in a revertable + // section. Don't unlock it here! + return true +} + +// Starts a revertable section by saving the current policy state. If another +// revertable section is already active, this will wait until that one is +// finished. +func (policy *regoEnforcer) StartRevertableSection() (RevertableSectionHandle, error) { + policy.revertableSectionLock.Lock() + var err error + policy.savedMetadata, err = policy.rego.SaveMetadata() + if err != nil { + err = errors.Wrapf(err, "unable to save metadata for revertable section") + policy.revertableSectionLock.Unlock() + return &revertableSectionHandle{}, err + } + // Keep policy.revertableSectionLock locked until the end of the section. + sh := &revertableSectionHandle{ + policy: policy, + } + return sh, nil +} + +func (sh *revertableSectionHandle) Commit() { + if sh.policy == nil { + gcs.UnrecoverableError(errors.New("revertable section handle already used")) + } + + policy := sh.policy + sh.policy = nil + lockSucc := policy.revertableSectionLock.TryLock() + if lockSucc { + gcs.UnrecoverableError(errors.New("not in a revertable section")) + } else { + // somebody else (i.e. the caller) has the lock, so we're in a revertable + // section. Clear the saved metadata just in case, then unlock to exit the + // section. + policy.savedMetadata = rpi.SavedMetadata{} + policy.revertableSectionLock.Unlock() + } +} + +func (sh *revertableSectionHandle) Rollback() { + if sh.policy == nil { + gcs.UnrecoverableError(errors.New("revertable section handle already used")) + } + + policy := sh.policy + sh.policy = nil + lockSucc := policy.revertableSectionLock.TryLock() + if lockSucc { + gcs.UnrecoverableError(errors.New("not in a revertable section")) + } else { + // somebody else (i.e. the caller) has the lock, so we're in a revertable + // section. Restore the saved metadata, then unlock to exit the section. + err := policy.rego.RestoreMetadata(policy.savedMetadata) + if err != nil { + gcs.UnrecoverableError(errors.Wrap(err, "unable to restore metadata for revertable section")) + } + policy.revertableSectionLock.Unlock() + } +} From 9f98a710c62598a12b0a6030740d7594f40dc03b Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Thu, 4 Jun 2026 00:36:25 +0000 Subject: [PATCH 02/12] Fix copilot review comments from blockdev mounts PR Addresses the following comments: https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091150 https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091204 https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091229 https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091293 https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091326 https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091352 Assisted-by: GitHub Copilot:claude-opus-4.7 copilot-review Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 2 +- internal/regopolicyinterpreter/regopolicyinterpreter_test.go | 2 ++ pkg/securitypolicy/securitypolicyenforcer.go | 2 +- pkg/securitypolicy/securitypolicyenforcer_rego.go | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 572126e54a..3586ecf56e 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -403,7 +403,7 @@ func checkContainerSettings(sandboxID, containerID string, settings *prot.VMHost func (h *Host) checkMountsNotBroken() error { if h.HasSecurityPolicy() && h.mountsBroken.Load() { return errors.Errorf( - "Mount, unmount, container creation and deletion has been disabled in this UVM due to a previous error (%q)", + "Mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error (%q)", h.mountsBrokenCausedBy, ) } diff --git a/internal/regopolicyinterpreter/regopolicyinterpreter_test.go b/internal/regopolicyinterpreter/regopolicyinterpreter_test.go index 3872afff51..534b87e892 100644 --- a/internal/regopolicyinterpreter/regopolicyinterpreter_test.go +++ b/internal/regopolicyinterpreter/regopolicyinterpreter_test.go @@ -89,9 +89,11 @@ func Test_copyRegoMetadata(t *testing.T) { if copyObject, ok := copy[name]; ok { if !assertObjectsEqual(origObject, copyObject) { t.Errorf("original and copy differ on key %s", name) + return false } } else { t.Errorf("copy missing object %s", name) + return false } } diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index 8a4457ce22..6f431429c2 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -59,7 +59,7 @@ func init() { // Represents an in-progress revertable section. To ensure state is consistent, // Commit() and Rollback() must not fail, so they do not return anything, and if -// an error does occur they should panic. +// an error does occur they should panic or trigger an unrecoverable error. type RevertableSectionHandle interface { Commit() Rollback() diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index 1997ff48f9..0463231487 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -1227,7 +1227,7 @@ func (policy *regoEnforcer) StartRevertableSection() (RevertableSectionHandle, e if err != nil { err = errors.Wrapf(err, "unable to save metadata for revertable section") policy.revertableSectionLock.Unlock() - return &revertableSectionHandle{}, err + return nil, err } // Keep policy.revertableSectionLock locked until the end of the section. sh := &revertableSectionHandle{ From 36c9cd77e723a0073e1df12e16fdc865b71ff1e1 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Tue, 9 Jun 2026 10:40:43 +0000 Subject: [PATCH 03/12] Address some comments from Maksim Assisted-by: GitHub Copilot:auto copilot-review Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 3586ecf56e..385ab04206 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -117,11 +117,11 @@ type Host struct { // hostMounts keeps the state of currently mounted devices and file systems, // which is used for GCS hardening. hostMounts *hostMounts - // A permanent flag to indicate that further mounts, unmounts and container - // creation should not be allowed. This is set when, because of a failure - // during an unmount operation, we end up in a state where the policy - // enforcer's state is out of sync with what we have actually done, but we - // cannot safely revert its state. + // mountsBroken is a permanent flag to indicate that further mounts, + // unmounts and container creation should not be allowed. This is set when, + // because of a failure during an unmount operation, we end up in a state + // where the policy enforcer's state is out of sync with what we have + // actually done, but we cannot safely revert its state. // // Not used in non-confidential mode. mountsBroken atomic.Bool @@ -414,21 +414,22 @@ func (h *Host) setMountsBrokenIfConfidential(cause string) { if !h.HasSecurityPolicy() { return } - h.mountsBroken.Store(true) + // write cause before atomic store of the bool below so that cause is never nil when the flag is set. h.mountsBrokenCausedBy = cause + h.mountsBroken.Store(true) log.G(context.Background()).WithFields(logrus.Fields{ "cause": cause, }).Error("Host::mountsBroken set to true. All further mounts/unmounts, container creation and deletion will fail.") } -func checkExists(path string) (error, bool) { +func checkExists(path string) (bool, error) { if _, err := os.Stat(path); err != nil { if os.IsNotExist(err) { - return nil, false + return false, nil } - return errors.Wrapf(err, "failed to determine if path '%s' exists", path), false + return false, errors.Wrapf(err, "failed to determine if path '%s' exists", path) } - return nil, true + return true, nil } func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { @@ -1335,7 +1336,7 @@ func (h *Host) modifyMappedVirtualDisk( // find out whether an arbitrary path (which may point to sensitive // data within a container rootfs) exists or not if h.HasSecurityPolicy() { - err, exists := checkExists(mvd.MountPath) + exists, err := checkExists(mvd.MountPath) if err != nil { return err } @@ -1463,7 +1464,7 @@ func (h *Host) modifyMappedVPMemDevice(ctx context.Context, // Similar to the reasoning in modifyMappedVirtualDisk, we should not do // this check before calling the policy enforcer. if h.HasSecurityPolicy() { - err, exists := checkExists(vpd.MountPath) + exists, err := checkExists(vpd.MountPath) if err != nil { return err } From 0bcb77b1b1a2d77635c2d510a9c5faf8bcda7b59 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Tue, 9 Jun 2026 12:17:58 +0000 Subject: [PATCH 04/12] Address mountsBroken refactor suggestion from Maksim Assisted-by: GitHub Copilot:auto copilot-review Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 95 ++++++++++++++++++----------- 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 385ab04206..78f1b1ccef 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -17,7 +17,6 @@ import ( "regexp" "strings" "sync" - "sync/atomic" "syscall" "time" @@ -117,16 +116,44 @@ type Host struct { // hostMounts keeps the state of currently mounted devices and file systems, // which is used for GCS hardening. hostMounts *hostMounts - // mountsBroken is a permanent flag to indicate that further mounts, - // unmounts and container creation should not be allowed. This is set when, - // because of a failure during an unmount operation, we end up in a state - // where the policy enforcer's state is out of sync with what we have - // actually done, but we cannot safely revert its state. + // uvmError contains a permanent flag to indicate that further mounts, + // unmounts and container creation / deletion should not be allowed. This + // is set when, because of a failure during an unmount operation, we end up + // in a state where the policy enforcer's state is out of sync with what we + // have actually done, but we cannot safely revert its state. // - // Not used in non-confidential mode. - mountsBroken atomic.Bool - // A user-friendly error message for why mountsBroken was set. - mountsBrokenCausedBy string + // Only consulted in confidential mode (see Host.checkState). + uvmError uvmConsistencyError +} + +type uvmConsistencyError struct { + mu sync.Mutex + // A user-friendly error message for why the UVM has entered an inconsistent + // state. If this is empty, there is no error and Check() returns nil. + cause string +} + +// Mark that the UVM has entered an inconsistent state, and store the cause if +// it is not already set. +func (u *uvmConsistencyError) Set(cause string) { + u.mu.Lock() + defer u.mu.Unlock() + if u.cause == "" { + u.cause = cause + } +} + +// Check returns a non-nil error if the UVM has been marked inconsistent. +func (u *uvmConsistencyError) Check() error { + u.mu.Lock() + defer u.mu.Unlock() + if u.cause != "" { + return errors.Errorf( + "Mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error (%q)", + u.cause, + ) + } + return nil } func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer securitypolicy.SecurityPolicyEnforcer, logWriter io.Writer) *Host { @@ -146,7 +173,7 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s devNullTransport: &transport.DevNullTransport{}, hostMounts: newHostMounts(), securityOptions: securityPolicyOptions, - mountsBroken: atomic.Bool{}, + uvmError: uvmConsistencyError{}, } } @@ -398,28 +425,26 @@ func checkContainerSettings(sandboxID, containerID string, settings *prot.VMHost return nil } -// Returns an error if h.mountsBroken is set (and we're in a confidential -// container host) -func (h *Host) checkMountsNotBroken() error { - if h.HasSecurityPolicy() && h.mountsBroken.Load() { - return errors.Errorf( - "Mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error (%q)", - h.mountsBrokenCausedBy, - ) +// checkState returns an error if the UVM has entered an inconsistent state from +// which it cannot safely recover. Only enforced for confidential containers. +func (h *Host) checkState() error { + if h.HasSecurityPolicy() { + return h.uvmError.Check() } return nil } -func (h *Host) setMountsBrokenIfConfidential(cause string) { +// setUVMInconsistent records that the UVM has entered an inconsistent state and +// logs the cause. The flag is only consulted in confidential mode (see +// checkState), so this is a no-op for non-confidential hosts. +func (h *Host) setUVMInconsistent(cause string) { if !h.HasSecurityPolicy() { return } - // write cause before atomic store of the bool below so that cause is never nil when the flag is set. - h.mountsBrokenCausedBy = cause - h.mountsBroken.Store(true) + h.uvmError.Set(cause) log.G(context.Background()).WithFields(logrus.Fields{ "cause": cause, - }).Error("Host::mountsBroken set to true. All further mounts/unmounts, container creation and deletion will fail.") + }).Error("Host marked inconsistent. All further mounts/unmounts, container creation and deletion will fail.") } func checkExists(path string) (bool, error) { @@ -433,7 +458,7 @@ func checkExists(path string) (bool, error) { } func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { - if err = h.checkMountsNotBroken(); err != nil { + if err = h.checkState(); err != nil { return nil, err } @@ -788,7 +813,7 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * case guestresource.ResourceTypeSCSIDevice: return modifySCSIDevice(ctx, req.RequestType, req.Settings.(*guestresource.SCSIDevice)) case guestresource.ResourceTypeMappedVirtualDisk: - if err := h.checkMountsNotBroken(); err != nil { + if err := h.checkState(); err != nil { return err } @@ -831,19 +856,19 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * } return h.modifyMappedVirtualDisk(ctx, req.RequestType, mvd) case guestresource.ResourceTypeMappedDirectory: - if err := h.checkMountsNotBroken(); err != nil { + if err := h.checkState(); err != nil { return err } return h.modifyMappedDirectory(ctx, h.vsock, req.RequestType, req.Settings.(*guestresource.LCOWMappedDirectory)) case guestresource.ResourceTypeVPMemDevice: - if err := h.checkMountsNotBroken(); err != nil { + if err := h.checkState(); err != nil { return err } return h.modifyMappedVPMemDevice(ctx, req.RequestType, req.Settings.(*guestresource.LCOWMappedVPMemDevice)) case guestresource.ResourceTypeCombinedLayers: - if err := h.checkMountsNotBroken(); err != nil { + if err := h.checkState(); err != nil { return err } @@ -1327,7 +1352,7 @@ func (h *Host) modifyMappedVirtualDisk( } // Check that the directory actually exists first, and if it does // not then we just refuse to do anything, without closing the dm - // device or setting the mountsBroken flag. Policy metadata is + // device or marking the UVM inconsistent. Policy metadata is // still reverted to reflect the fact that we have not done // anything. // @@ -1353,7 +1378,7 @@ func (h *Host) modifyMappedVirtualDisk( } err = scsi.Unmount(ctx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, config) if err != nil { - h.setMountsBrokenIfConfidential( + h.setUVMInconsistent( fmt.Sprintf("unmounting scsi device at %s failed: %v", mvd.MountPath, err), ) return err @@ -1402,7 +1427,7 @@ func (h *Host) modifyMappedDirectory( // Note: storage.UnmountPath is nop if path does not exist. err = storage.UnmountPath(ctx, md.MountPath, true) if err != nil { - h.setMountsBrokenIfConfidential( + h.setUVMInconsistent( fmt.Sprintf("unmounting plan9 device at %s failed: %v", md.MountPath, err), ) return err @@ -1459,7 +1484,7 @@ func (h *Host) modifyMappedVPMemDevice(ctx context.Context, // Check that the directory actually exists first, and if it does not // then we just refuse to do anything, without closing the dm-linear or - // dm-verity device or setting the mountsBroken flag. + // dm-verity device or marking the UVM inconsistent. // // Similar to the reasoning in modifyMappedVirtualDisk, we should not do // this check before calling the policy enforcer. @@ -1475,7 +1500,7 @@ func (h *Host) modifyMappedVPMemDevice(ctx context.Context, err = pmem.Unmount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) if err != nil { - h.setMountsBrokenIfConfidential( + h.setUVMInconsistent( fmt.Sprintf("unmounting pmem device at %s failed: %v", vpd.MountPath, err), ) return err @@ -1586,7 +1611,7 @@ func (h *Host) modifyCombinedLayers( // Note: storage.UnmountPath is a no-op if the path does not exist. err = storage.UnmountPath(ctx, cl.ContainerRootPath, true) if err != nil { - h.setMountsBrokenIfConfidential( + h.setUVMInconsistent( fmt.Sprintf("unmounting overlay at %s failed: %v", cl.ContainerRootPath, err), ) return err From 09dca88cc0365289a3eea3126d65e0970fa63456 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Tue, 9 Jun 2026 12:23:53 +0000 Subject: [PATCH 05/12] Remove unused function inRevertableSection Assisted-by: GitHub Copilot:auto copilot-review Signed-off-by: Tingmao Wang --- pkg/securitypolicy/securitypolicyenforcer_rego.go | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index 0463231487..3aa8c46fc0 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -1204,19 +1204,6 @@ type revertableSectionHandle struct { policy *regoEnforcer } -func (policy *regoEnforcer) inRevertableSection() bool { - succ := policy.revertableSectionLock.TryLock() - if succ { - // since nobody else has the lock, we're not in fact in a revertable - // section. - policy.revertableSectionLock.Unlock() - return false - } - // somebody else (i.e. the caller) has the lock, so we're in a revertable - // section. Don't unlock it here! - return true -} - // Starts a revertable section by saving the current policy state. If another // revertable section is already active, this will wait until that one is // finished. From 67fabcdabcdf3c83c0ffe2936b5e8259bce56c68 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Tue, 9 Jun 2026 13:42:57 +0000 Subject: [PATCH 06/12] Implement revertable section to closure-based transaction refactor suggestion from Maksim Rebase done with Copilot without much manual review. Assisted-by: GitHub Copilot:claude-opus-4.8 copilot-review Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 500 ++++++++---------- pkg/securitypolicy/rego_utils_test.go | 17 - pkg/securitypolicy/regopolicy_linux_test.go | 198 ++++--- pkg/securitypolicy/securitypolicyenforcer.go | 23 +- .../securitypolicyenforcer_rego.go | 82 +-- 5 files changed, 386 insertions(+), 434 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 78f1b1ccef..bce8e7aebe 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -1292,102 +1292,99 @@ func (h *Host) modifyMappedVirtualDisk( } } - // For confidential containers, we revert the policy metadata on both mount - // and unmount errors, but if we've actually called Unmount and it fails we - // permanently block further device operations. - var rev securitypolicy.RevertableSectionHandle - rev, err = securityPolicy.StartRevertableSection() - if err != nil { - return errors.Wrapf(err, "failed to start revertable section on security policy enforcer") - } - defer h.commitOrRollbackPolicyRevSection(ctx, rev, &err) - - switch rt { - case guestrequest.RequestTypeAdd: - mountCtx, cancel := context.WithTimeout(ctx, time.Second*5) - defer cancel() - if mvd.MountPath != "" { - if mvd.ReadOnly { - var deviceHash string - if verityInfo != nil { - deviceHash = verityInfo.RootDigest - } - err = securityPolicy.EnforceDeviceMountPolicy(ctx, mvd.MountPath, deviceHash) - if err != nil { - return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath) + // For confidential containers, we revert the policy metadata via the + // transaction rollback mechanism on both mount and unmount errors, but if + // we've actually called Unmount and it fails we permanently block further + // device operations by marking the UVM state as inconsistent. + return securityPolicy.WithTransaction(func() error { + switch rt { + case guestrequest.RequestTypeAdd: + mountCtx, cancel := context.WithTimeout(ctx, time.Second*5) + defer cancel() + if mvd.MountPath != "" { + if mvd.ReadOnly { + var deviceHash string + if verityInfo != nil { + deviceHash = verityInfo.RootDigest + } + err = securityPolicy.EnforceDeviceMountPolicy(ctx, mvd.MountPath, deviceHash) + if err != nil { + return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath) + } + } else { + err = securityPolicy.EnforceRWDeviceMountPolicy(ctx, mvd.MountPath, mvd.Encrypted, mvd.EnsureFilesystem, mvd.Filesystem) + if err != nil { + return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath) + } } - } else { - err = securityPolicy.EnforceRWDeviceMountPolicy(ctx, mvd.MountPath, mvd.Encrypted, mvd.EnsureFilesystem, mvd.Filesystem) - if err != nil { - return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath) + config := &scsi.Config{ + Encrypted: mvd.Encrypted, + VerityInfo: verityInfo, + EnsureFilesystem: mvd.EnsureFilesystem, + Filesystem: mvd.Filesystem, + BlockDev: mvd.BlockDev, } + // Since we're rolling back the policy metadata on failure, we + // need to ensure that we have reverted all the side effects + // from this failed mount attempt, otherwise the Rego metadata + // is technically still inconsistent with reality. Mount cleans + // up the created directory and dm devices on failure, so we're + // good. + return scsi.Mount(mountCtx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, + mvd.ReadOnly, mvd.Options, config) } - config := &scsi.Config{ - Encrypted: mvd.Encrypted, - VerityInfo: verityInfo, - EnsureFilesystem: mvd.EnsureFilesystem, - Filesystem: mvd.Filesystem, - BlockDev: mvd.BlockDev, - } - // Since we're rolling back the policy metadata (via the revertable - // section) on failure, we need to ensure that we have reverted all - // the side effects from this failed mount attempt, otherwise the - // Rego metadata is technically still inconsistent with reality. - // Mount cleans up the created directory and dm devices on failure, - // so we're good. - return scsi.Mount(mountCtx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, - mvd.ReadOnly, mvd.Options, config) - } - return nil - case guestrequest.RequestTypeRemove: - if mvd.MountPath != "" { - if mvd.ReadOnly { - if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { - return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) + return nil + case guestrequest.RequestTypeRemove: + if mvd.MountPath != "" { + if mvd.ReadOnly { + if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { + return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) + } + } else { + if err = securityPolicy.EnforceRWDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { + return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) + } } - } else { - if err = securityPolicy.EnforceRWDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { - return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) + // Check that the directory actually exists first, and if it + // does not then we just refuse to do anything, without closing + // the dm device or marking the UVM inconsistent. Policy + // metadata is still reverted to reflect the fact that we have + // not done anything. + // + // Note: we should not do this check before calling the policy + // enforcer (which we have done above), otherwise we will + // inadvertently allow the host to find out whether an arbitrary + // path (which may point to sensitive data within a container + // rootfs) exists or not + if h.HasSecurityPolicy() { + exists, err := checkExists(mvd.MountPath) + if err != nil { + return err + } + if !exists { + return errors.Errorf("unmounting scsi device at %s failed: directory does not exist", mvd.MountPath) + } } - } - // Check that the directory actually exists first, and if it does - // not then we just refuse to do anything, without closing the dm - // device or marking the UVM inconsistent. Policy metadata is - // still reverted to reflect the fact that we have not done - // anything. - // - // Note: we should not do this check before calling the policy - // enforcer, as otherwise we might inadvertently allow the host to - // find out whether an arbitrary path (which may point to sensitive - // data within a container rootfs) exists or not - if h.HasSecurityPolicy() { - exists, err := checkExists(mvd.MountPath) + config := &scsi.Config{ + Encrypted: mvd.Encrypted, + VerityInfo: verityInfo, + EnsureFilesystem: mvd.EnsureFilesystem, + Filesystem: mvd.Filesystem, + BlockDev: mvd.BlockDev, + } + err = scsi.Unmount(ctx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, config) if err != nil { + h.setUVMInconsistent( + fmt.Sprintf("unmounting scsi device at %s failed: %v", mvd.MountPath, err), + ) return err } - if !exists { - return errors.Errorf("unmounting scsi device at %s failed: directory does not exist", mvd.MountPath) - } - } - config := &scsi.Config{ - Encrypted: mvd.Encrypted, - VerityInfo: verityInfo, - EnsureFilesystem: mvd.EnsureFilesystem, - Filesystem: mvd.Filesystem, - BlockDev: mvd.BlockDev, - } - err = scsi.Unmount(ctx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, config) - if err != nil { - h.setUVMInconsistent( - fmt.Sprintf("unmounting scsi device at %s failed: %v", mvd.MountPath, err), - ) - return err } + return nil + default: + return newInvalidRequestTypeError(rt) } - return nil - default: - return newInvalidRequestTypeError(rt) - } + }) } func (h *Host) modifyMappedDirectory( @@ -1397,45 +1394,41 @@ func (h *Host) modifyMappedDirectory( md *guestresource.LCOWMappedDirectory, ) (err error) { securityPolicy := h.securityOptions.PolicyEnforcer - // For confidential containers, we revert the policy metadata on both mount - // and unmount errors, but if we've actually called Unmount and it fails we - // permanently block further device operations. - var rev securitypolicy.RevertableSectionHandle - rev, err = securityPolicy.StartRevertableSection() - if err != nil { - return errors.Wrapf(err, "failed to start revertable section on security policy enforcer") - } - defer h.commitOrRollbackPolicyRevSection(ctx, rev, &err) - - switch rt { - case guestrequest.RequestTypeAdd: - err = securityPolicy.EnforcePlan9MountPolicy(ctx, md.MountPath) - if err != nil { - return errors.Wrapf(err, "mounting plan9 device at %s denied by policy", md.MountPath) - } + // For confidential containers, we revert the policy metadata via the + // transaction rollback mechanism on both mount and unmount errors, but if + // we've actually called Unmount and it fails we permanently block further + // device operations. + return securityPolicy.WithTransaction(func() error { + switch rt { + case guestrequest.RequestTypeAdd: + err = securityPolicy.EnforcePlan9MountPolicy(ctx, md.MountPath) + if err != nil { + return errors.Wrapf(err, "mounting plan9 device at %s denied by policy", md.MountPath) + } - // Similar to the reasoning in modifyMappedVirtualDisk, since we're - // rolling back the policy metadata, plan9.Mount here must clean up - // everything if it fails, which it does do. - return plan9.Mount(ctx, vsock, md.MountPath, md.ShareName, uint32(md.Port), md.ReadOnly) - case guestrequest.RequestTypeRemove: - err = securityPolicy.EnforcePlan9UnmountPolicy(ctx, md.MountPath) - if err != nil { - return errors.Wrapf(err, "unmounting plan9 device at %s denied by policy", md.MountPath) - } + // Similar to the reasoning in modifyMappedVirtualDisk, since we're + // rolling back the policy metadata, plan9.Mount here must clean up + // everything if it fails, which it does do. + return plan9.Mount(ctx, vsock, md.MountPath, md.ShareName, uint32(md.Port), md.ReadOnly) + case guestrequest.RequestTypeRemove: + err = securityPolicy.EnforcePlan9UnmountPolicy(ctx, md.MountPath) + if err != nil { + return errors.Wrapf(err, "unmounting plan9 device at %s denied by policy", md.MountPath) + } - // Note: storage.UnmountPath is nop if path does not exist. - err = storage.UnmountPath(ctx, md.MountPath, true) - if err != nil { - h.setUVMInconsistent( - fmt.Sprintf("unmounting plan9 device at %s failed: %v", md.MountPath, err), - ) - return err + // Note: storage.UnmountPath is nop if path does not exist. + err = storage.UnmountPath(ctx, md.MountPath, true) + if err != nil { + h.setUVMInconsistent( + fmt.Sprintf("unmounting plan9 device at %s failed: %v", md.MountPath, err), + ) + return err + } + return nil + default: + return newInvalidRequestTypeError(rt) } - return nil - default: - return newInvalidRequestTypeError(rt) - } + }) } func (h *Host) modifyMappedVPMemDevice(ctx context.Context, @@ -1456,59 +1449,55 @@ func (h *Host) modifyMappedVPMemDevice(ctx context.Context, deviceHash = verityInfo.RootDigest } - // For confidential containers, we revert the policy metadata on both mount - // and unmount errors, but if we've actually called Unmount and it fails we - // permanently block further device operations. - var rev securitypolicy.RevertableSectionHandle - rev, err = securityPolicy.StartRevertableSection() - if err != nil { - return errors.Wrapf(err, "failed to start revertable section on security policy enforcer") - } - defer h.commitOrRollbackPolicyRevSection(ctx, rev, &err) + // For confidential containers, we revert the policy metadata via the + // transaction rollback mechanism on both mount and unmount errors, but if + // we've actually called Unmount and it fails we permanently block further + // device operations. + return securityPolicy.WithTransaction(func() error { + switch rt { + case guestrequest.RequestTypeAdd: + err = securityPolicy.EnforceDeviceMountPolicy(ctx, vpd.MountPath, deviceHash) + if err != nil { + return errors.Wrapf(err, "mounting pmem device %d onto %s denied by policy", vpd.DeviceNumber, vpd.MountPath) + } - switch rt { - case guestrequest.RequestTypeAdd: - err = securityPolicy.EnforceDeviceMountPolicy(ctx, vpd.MountPath, deviceHash) - if err != nil { - return errors.Wrapf(err, "mounting pmem device %d onto %s denied by policy", vpd.DeviceNumber, vpd.MountPath) - } + // Similar to the reasoning in modifyMappedVirtualDisk, since we're + // rolling back the policy metadata, pmem.Mount here must clean up + // everything if it fails, which it does do. + return pmem.Mount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) + case guestrequest.RequestTypeRemove: + if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, vpd.MountPath); err != nil { + return errors.Wrapf(err, "unmounting pmem device from %s denied by policy", vpd.MountPath) + } - // Similar to the reasoning in modifyMappedVirtualDisk, since we're - // rolling back the policy metadata, pmem.Mount here must clean up - // everything if it fails, which it does do. - return pmem.Mount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) - case guestrequest.RequestTypeRemove: - if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, vpd.MountPath); err != nil { - return errors.Wrapf(err, "unmounting pmem device from %s denied by policy", vpd.MountPath) - } + // Check that the directory actually exists first, and if it does not + // then we just refuse to do anything, without closing the dm-linear or + // dm-verity device or marking the UVM inconsistent. + // + // Similar to the reasoning in modifyMappedVirtualDisk, we should not do + // this check before calling the policy enforcer. + if h.HasSecurityPolicy() { + exists, err := checkExists(vpd.MountPath) + if err != nil { + return err + } + if !exists { + return errors.Errorf("unmounting pmem device at %s failed: directory does not exist", vpd.MountPath) + } + } - // Check that the directory actually exists first, and if it does not - // then we just refuse to do anything, without closing the dm-linear or - // dm-verity device or marking the UVM inconsistent. - // - // Similar to the reasoning in modifyMappedVirtualDisk, we should not do - // this check before calling the policy enforcer. - if h.HasSecurityPolicy() { - exists, err := checkExists(vpd.MountPath) + err = pmem.Unmount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) if err != nil { + h.setUVMInconsistent( + fmt.Sprintf("unmounting pmem device at %s failed: %v", vpd.MountPath, err), + ) return err } - if !exists { - return errors.Errorf("unmounting pmem device at %s failed: directory does not exist", vpd.MountPath) - } - } - - err = pmem.Unmount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) - if err != nil { - h.setUVMInconsistent( - fmt.Sprintf("unmounting pmem device at %s failed: %v", vpd.MountPath, err), - ) - return err + return nil + default: + return newInvalidRequestTypeError(rt) } - return nil - default: - return newInvalidRequestTypeError(rt) - } + }) } func modifyMappedVPCIDevice(ctx context.Context, rt guestrequest.RequestType, vpciDev *guestresource.LCOWMappedVPCIDevice) error { @@ -1529,97 +1518,93 @@ func (h *Host) modifyCombinedLayers( securityPolicy := h.securityOptions.PolicyEnforcer containerID := cl.ContainerID - // For confidential containers, we revert the policy metadata on both mount - // and unmount errors, but if we've actually called Unmount and it fails we - // permanently block further device operations. - var rev securitypolicy.RevertableSectionHandle - rev, err = securityPolicy.StartRevertableSection() - if err != nil { - return errors.Wrapf(err, "failed to start revertable section on security policy enforcer") - } - defer h.commitOrRollbackPolicyRevSection(ctx, rev, &err) - - switch rt { - case guestrequest.RequestTypeAdd: - if h.HasSecurityPolicy() { - if err := checkValidContainerID(containerID, "container"); err != nil { - return err - } + // For confidential containers, we revert the policy metadata via the + // transaction rollback mechanism on both mount and unmount errors, but if + // we've actually called Unmount and it fails we permanently block further + // device operations. + return securityPolicy.WithTransaction(func() error { + switch rt { + case guestrequest.RequestTypeAdd: + if h.HasSecurityPolicy() { + if err := checkValidContainerID(containerID, "container"); err != nil { + return err + } - // We check this regardless of what the policy says, as long as we're in - // confidential mode. This matches with checkContainerSettings called for - // container creation request. - expectedContainerRootfs := path.Join(guestpath.LCOWRootPrefixInUVM, containerID, guestpath.RootfsPath) - if cl.ContainerRootPath != expectedContainerRootfs { - return fmt.Errorf("combined layers target %q does not match expected path %q", - cl.ContainerRootPath, expectedContainerRootfs) - } + // We check this regardless of what the policy says, as long as we're in + // confidential mode. This matches with checkContainerSettings called for + // container creation request. + expectedContainerRootfs := path.Join(guestpath.LCOWRootPrefixInUVM, containerID, guestpath.RootfsPath) + if cl.ContainerRootPath != expectedContainerRootfs { + return fmt.Errorf("combined layers target %q does not match expected path %q", + cl.ContainerRootPath, expectedContainerRootfs) + } - if cl.ScratchPath != "" { - // At this point, we do not know what the sandbox ID would be yet, so we - // have to allow anything reasonable. - scratchDirRegexStr := fmt.Sprintf( - "^%s/%s/%s/%s$", - guestpath.LCOWRootPrefixInUVM, - validContainerIDRegexRaw, - guestpath.ScratchDir, - containerID, - ) - scratchDirRegex := regexp.MustCompile(scratchDirRegexStr) - if !scratchDirRegex.MatchString(cl.ScratchPath) { - return fmt.Errorf("scratch path %q must match regex %q", - cl.ScratchPath, scratchDirRegexStr) + if cl.ScratchPath != "" { + // At this point, we do not know what the sandbox ID would be yet, so we + // have to allow anything reasonable. + scratchDirRegexStr := fmt.Sprintf( + "^%s/%s/%s/%s$", + guestpath.LCOWRootPrefixInUVM, + validContainerIDRegexRaw, + guestpath.ScratchDir, + containerID, + ) + scratchDirRegex := regexp.MustCompile(scratchDirRegexStr) + if !scratchDirRegex.MatchString(cl.ScratchPath) { + return fmt.Errorf("scratch path %q must match regex %q", + cl.ScratchPath, scratchDirRegexStr) + } } } - } - layerPaths := make([]string, len(cl.Layers)) - for i, layer := range cl.Layers { - layerPaths[i] = layer.Path - } + layerPaths := make([]string, len(cl.Layers)) + for i, layer := range cl.Layers { + layerPaths[i] = layer.Path + } - var upperdirPath string - var workdirPath string - readonly := false - if cl.ScratchPath == "" { - // The user did not pass a scratch path. Mount overlay as readonly. - readonly = true - } else { - upperdirPath = filepath.Join(cl.ScratchPath, "upper") - workdirPath = filepath.Join(cl.ScratchPath, "work") + var upperdirPath string + var workdirPath string + readonly := false + if cl.ScratchPath == "" { + // The user did not pass a scratch path. Mount overlay as readonly. + readonly = true + } else { + upperdirPath = filepath.Join(cl.ScratchPath, "upper") + workdirPath = filepath.Join(cl.ScratchPath, "work") - if err := securityPolicy.EnforceScratchMountPolicy(ctx, cl.ScratchPath, scratchEncrypted); err != nil { - return fmt.Errorf("scratch mounting denied by policy: %w", err) + if err := securityPolicy.EnforceScratchMountPolicy(ctx, cl.ScratchPath, scratchEncrypted); err != nil { + return fmt.Errorf("scratch mounting denied by policy: %w", err) + } } - } - if err := securityPolicy.EnforceOverlayMountPolicy(ctx, containerID, layerPaths, cl.ContainerRootPath); err != nil { - return fmt.Errorf("overlay creation denied by policy: %w", err) - } + if err := securityPolicy.EnforceOverlayMountPolicy(ctx, containerID, layerPaths, cl.ContainerRootPath); err != nil { + return fmt.Errorf("overlay creation denied by policy: %w", err) + } - // Correctness for policy revertable section: - // MountLayer does two things - mkdir, then mount. On mount failure, the - // target directory is cleaned up. Therefore we're clean in terms of - // side effects. - return overlay.MountLayer(ctx, layerPaths, upperdirPath, workdirPath, cl.ContainerRootPath, readonly) - case guestrequest.RequestTypeRemove: - // cl.ContainerID is not set on remove requests, but rego checks that we can - // only umount previously mounted targets anyway - if err = securityPolicy.EnforceOverlayUnmountPolicy(ctx, cl.ContainerRootPath); err != nil { - return errors.Wrap(err, "overlay removal denied by policy") - } + // Correctness for policy transaction rollback: + // MountLayer does two things - mkdir, then mount. On mount failure, the + // target directory is cleaned up. Therefore we're clean in terms of + // side effects. + return overlay.MountLayer(ctx, layerPaths, upperdirPath, workdirPath, cl.ContainerRootPath, readonly) + case guestrequest.RequestTypeRemove: + // cl.ContainerID is not set on remove requests, but rego checks that we can + // only umount previously mounted targets anyway + if err = securityPolicy.EnforceOverlayUnmountPolicy(ctx, cl.ContainerRootPath); err != nil { + return errors.Wrap(err, "overlay removal denied by policy") + } - // Note: storage.UnmountPath is a no-op if the path does not exist. - err = storage.UnmountPath(ctx, cl.ContainerRootPath, true) - if err != nil { - h.setUVMInconsistent( - fmt.Sprintf("unmounting overlay at %s failed: %v", cl.ContainerRootPath, err), - ) - return err + // Note: storage.UnmountPath is a no-op if the path does not exist. + err = storage.UnmountPath(ctx, cl.ContainerRootPath, true) + if err != nil { + h.setUVMInconsistent( + fmt.Sprintf("unmounting overlay at %s failed: %v", cl.ContainerRootPath, err), + ) + return err + } + return nil + default: + return newInvalidRequestTypeError(rt) } - return nil - default: - return newInvalidRequestTypeError(rt) - } + }) } func modifyNetwork(ctx context.Context, rt guestrequest.RequestType, na *guestresource.LCOWNetworkAdapter) (err error) { @@ -1744,22 +1729,3 @@ func (h *Host) createContainerInPod(sandboxID string, containerID string) error return nil } - -// If *err is not nil, the section is rolled back, otherwise it is committed. -func (h *Host) commitOrRollbackPolicyRevSection( - ctx context.Context, - rev securitypolicy.RevertableSectionHandle, - err *error, -) { - if !h.HasSecurityPolicy() { - // Don't produce bogus log entries if we aren't in confidential mode, - // even though rev.Rollback would have been no-op. - return - } - if *err != nil { - rev.Rollback() - logrus.WithContext(ctx).WithError(*err).Warn("rolling back security policy revertable section due to error") - } else { - rev.Commit() - } -} diff --git a/pkg/securitypolicy/rego_utils_test.go b/pkg/securitypolicy/rego_utils_test.go index bce0232f52..6afe7de6cc 100644 --- a/pkg/securitypolicy/rego_utils_test.go +++ b/pkg/securitypolicy/rego_utils_test.go @@ -380,7 +380,6 @@ func mountImageForContainerWithID(policy *regoEnforcer, container *securityPolic return nil } - func buildMountSpecFromMountArray(mounts []mountInternal, sandboxID string, r *rand.Rand) *oci.Spec { mountSpec := new(oci.Spec) @@ -3011,19 +3010,3 @@ type containerInitProcess struct { WorkingDir string AllowStdioAccess bool } - -func startRevertableSection(t *testing.T, policy *regoEnforcer) RevertableSectionHandle { - rev, err := policy.StartRevertableSection() - if err != nil { - t.Fatalf("Failed to start revertable section: %v", err) - } - return rev -} - -func commitOrRollback(rev RevertableSectionHandle, shouldCommit bool) { - if shouldCommit { - rev.Commit() - } else { - rev.Rollback() - } -} diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 0ddd933bac..3bf07f75ad 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -6,6 +6,7 @@ package securitypolicy import ( "context" "encoding/json" + "errors" "fmt" "math/rand" "os" @@ -975,41 +976,56 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { tid := testDataGenerator.uniqueContainerID() scratchTarget := getScratchDiskMountTarget(tid) - rev := startRevertableSection(t, policy) - err = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchTarget, true, true, "xfs") + errSimulatedFailure := errors.New("simulated failure") + + err = policy.WithTransaction(func() error { + return policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchTarget, true, true, "xfs") + }) if err != nil { t.Errorf("failed to EnforceRWDeviceMountPolicy: %v", err) return false } - rev.Commit() layerToErr := testRand.Intn(len(tc.Layers)) errLayerPathIndex := len(tc.Layers) - layerToErr - 1 layerPaths := make([]string, len(tc.Layers)) for i, layerHash := range tc.Layers { - rev := startRevertableSection(t, policy) target := testDataGenerator.uniqueLayerMountTarget() layerPaths[len(tc.Layers)-i-1] = target - err = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) - if err != nil { - t.Errorf("failed to EnforceDeviceMountPolicy: %v", err) + var policyErr error + err = policy.WithTransaction(func() error { + policyErr = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) + if policyErr != nil { + return policyErr + } + if i == layerToErr { + // Simulate a mount failure at this point, which will cause us to rollback. + return errSimulatedFailure + } + return nil + }) + if policyErr != nil { + t.Errorf("failed to EnforceDeviceMountPolicy: %v", policyErr) return false } - if i == layerToErr { - // Simulate a mount failure at this point, which will cause us to rollback - rev.Rollback() - } else { - rev.Commit() - } } - rev = startRevertableSection(t, policy) overlayTarget := getOverlayMountTarget(tid) - err = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPaths, overlayTarget) - if !assertDecisionJSONContains(t, err, append(slices.Clone(layerPaths), "no matching containers for overlay")...) { + var policyErr error + err = policy.WithTransaction(func() error { + policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPaths, overlayTarget) + if commitOnEnforcementFailure { + return nil + } + return policyErr + }) + if err != nil && commitOnEnforcementFailure { + t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + return false + } + if !assertDecisionJSONContains(t, policyErr, append(slices.Clone(layerPaths), "no matching containers for overlay")...) { return false } - commitOrRollback(rev, commitOnEnforcementFailure) layerPathsWithoutErr := make([]string, 0) for i, layerPath := range layerPaths { @@ -1018,21 +1034,29 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { } } - rev = startRevertableSection(t, policy) - err = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPathsWithoutErr, overlayTarget) - if !assertDecisionJSONContains(t, err, append(slices.Clone(layerPathsWithoutErr), "no matching containers for overlay")...) { + err = policy.WithTransaction(func() error { + policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPathsWithoutErr, overlayTarget) + if commitOnEnforcementFailure { + return nil + } + return policyErr + }) + if err != nil && commitOnEnforcementFailure { + t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + return false + } + if !assertDecisionJSONContains(t, policyErr, append(slices.Clone(layerPathsWithoutErr), "no matching containers for overlay")...) { return false } - commitOrRollback(rev, commitOnEnforcementFailure) retryTarget := layerPaths[errLayerPathIndex] - rev = startRevertableSection(t, policy) - err = policy.EnforceDeviceMountPolicy(gc.ctx, retryTarget, tc.Layers[layerToErr]) + err = policy.WithTransaction(func() error { + return policy.EnforceDeviceMountPolicy(gc.ctx, retryTarget, tc.Layers[layerToErr]) + }) if err != nil { t.Errorf("failed to EnforceDeviceMountPolicy again after one previous reverted failure: %v", err) return false } - rev.Commit() err = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPaths, overlayTarget) if err != nil { t.Errorf("failed to EnforceOverlayMountPolicy after one previous reverted failure: %v", err) @@ -6214,49 +6238,65 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } + errSimulatedFailure := errors.New("simulated failure") + scratchMountTarget := getScratchDiskMountTarget(containerID) - rev := startRevertableSection(t, policy) - err = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") + err = policy.WithTransaction(func() error { + return policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") + }) if err != nil { t.Errorf("Failed to EnforceRWDeviceMountPolicy: %v", err) return false } - rev.Commit() - rev = startRevertableSection(t, policy) overlayTarget := getOverlayMountTarget(containerID) - err = policy.EnforceOverlayMountPolicy(gc.ctx, containerID, layers, overlayTarget) - if err != nil { - t.Errorf("Failed to EnforceOverlayMountPolicy: %v", err) + var policyErr error + err = policy.WithTransaction(func() error { + policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, containerID, layers, overlayTarget) + if policyErr != nil { + return policyErr + } + // Simulate a failure by rolling back the overlay mount. + return errSimulatedFailure + }) + if policyErr != nil { + t.Errorf("Failed to EnforceOverlayMountPolicy: %v", policyErr) return false } - // Simulate a failure by rolling back the overlay mount - rev.Rollback() - rev = startRevertableSection(t, policy) - _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) - if err == nil { + err = policy.WithTransaction(func() error { + _, _, _, policyErr = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) + if commitOnEnforcementFailure { + return nil + } + return policyErr + }) + if policyErr == nil { t.Errorf("EnforceCreateContainerPolicy should have failed due to missing (reverted) overlay mount") return false } - commitOrRollback(rev, commitOnEnforcementFailure) + if err != nil && commitOnEnforcementFailure { + t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + return false + } // "Retry" overlay mount - rev = startRevertableSection(t, policy) - err = policy.EnforceOverlayMountPolicy(gc.ctx, tc.containerID, layers, overlayTarget) + err = policy.WithTransaction(func() error { + return policy.EnforceOverlayMountPolicy(gc.ctx, tc.containerID, layers, overlayTarget) + }) if err != nil { t.Errorf("Failed to EnforceOverlayMountPolicy: %v", err) return false } - rev.Commit() - rev = startRevertableSection(t, policy) - _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) + err = policy.WithTransaction(func() error { + _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) + return err + }) if err != nil { t.Errorf("Failed to EnforceCreateContainerPolicy after retrying overlay mount: %v", err) return false } - rev.Commit() return true } @@ -6289,68 +6329,84 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { t.Fatal(err) } + errSimulatedFailure := errors.New("simulated failure") + scratchMountTarget := getScratchDiskMountTarget(containerID) - rev := startRevertableSection(t, policy) - err = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") - if err != nil { - t.Errorf("Failed to EnforceRWDeviceMountPolicy: %v", err) + var policyErr error + err = policy.WithTransaction(func() error { + policyErr = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") + if policyErr != nil { + return policyErr + } + if failScratchMount { + return errSimulatedFailure + } + return nil + }) + if policyErr != nil { + t.Errorf("Failed to EnforceRWDeviceMountPolicy: %v", policyErr) return false } succeedLayerPaths := make([]string, 0) - if failScratchMount { - rev.Rollback() - } else { - rev.Commit() - + if !failScratchMount { // Simulate one of the layers failing to mount, after which the outside // gives up on this container and starts over. layerToErr := testRand.Intn(len(container.Layers)) for i, layerHash := range container.Layers { - rev := startRevertableSection(t, policy) target := testDataGenerator.uniqueLayerMountTarget() - err = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) - if err != nil { - t.Errorf("failed to EnforceDeviceMountPolicy: %v", err) + err = policy.WithTransaction(func() error { + policyErr = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) + if policyErr != nil { + return policyErr + } + if i == layerToErr { + // Simulate a mount failure at this point, which will cause us to rollback. + return errSimulatedFailure + } + return nil + }) + if policyErr != nil { + t.Errorf("failed to EnforceDeviceMountPolicy: %v", policyErr) return false } if i == layerToErr { - // Simulate a mount failure at this point, which will cause us to rollback - rev.Rollback() + // The simulated mount failure was rolled back, so the outside + // gives up on this container and starts over. break - } else { - rev.Commit() - succeedLayerPaths = append(succeedLayerPaths, target) } + succeedLayerPaths = append(succeedLayerPaths, target) } for _, layerPath := range succeedLayerPaths { - rev := startRevertableSection(t, policy) - err = policy.EnforceDeviceUnmountPolicy(gc.ctx, layerPath) + err = policy.WithTransaction(func() error { + return policy.EnforceDeviceUnmountPolicy(gc.ctx, layerPath) + }) if err != nil { t.Errorf("Failed to EnforceDeviceUnmountPolicy: %v", err) return false } - rev.Commit() } - rev = startRevertableSection(t, policy) - err = policy.EnforceRWDeviceUnmountPolicy(gc.ctx, scratchMountTarget) + err = policy.WithTransaction(func() error { + return policy.EnforceRWDeviceUnmountPolicy(gc.ctx, scratchMountTarget) + }) if err != nil { t.Errorf("Failed to EnforceRWDeviceUnmountPolicy: %v", err) return false } - rev.Commit() } if testDenyInvalidContainerCreation { - rev = startRevertableSection(t, policy) - _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) - if err == nil { + err = policy.WithTransaction(func() error { + _, _, _, policyErr = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) + return policyErr + }) + if policyErr == nil { t.Errorf("EnforceCreateContainerPolicy should have failed due to missing (reverted) overlay mount") + return false } - rev.Rollback() } if newContainerID { diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index 6f431429c2..bd48373cbc 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -57,14 +57,6 @@ func init() { registeredEnforcers[openDoorEnforcerName] = createOpenDoorEnforcer } -// Represents an in-progress revertable section. To ensure state is consistent, -// Commit() and Rollback() must not fail, so they do not return anything, and if -// an error does occur they should panic or trigger an unrecoverable error. -type RevertableSectionHandle interface { - Commit() - Rollback() -} - type SecurityPolicyEnforcer interface { EnforceDeviceMountPolicy(ctx context.Context, target string, deviceHash string) (err error) EnforceRWDeviceMountPolicy(ctx context.Context, target string, encrypted, ensureFilesystem bool, filesystem string) (err error) @@ -136,7 +128,7 @@ type SecurityPolicyEnforcer interface { GetUserInfo(spec *oci.Process, rootPath string) (IDName, []IDName, string, error) EnforceVerifiedCIMsPolicy(ctx context.Context, containerID string, layerHashes []string, mountedCim []string) (err error) EnforceRegistryChangesPolicy(ctx context.Context, containerID string, registryValues interface{}) error - StartRevertableSection() (RevertableSectionHandle, error) + WithTransaction(fn func() error) error } //nolint:unused @@ -191,11 +183,6 @@ func CreateSecurityPolicyEnforcer( } } -type nopRevertableSectionHandle struct{} - -func (nopRevertableSectionHandle) Commit() {} -func (nopRevertableSectionHandle) Rollback() {} - type OpenDoorSecurityPolicyEnforcer struct { encodedSecurityPolicy string } @@ -338,8 +325,8 @@ func (OpenDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context.C return nil } -func (*OpenDoorSecurityPolicyEnforcer) StartRevertableSection() (RevertableSectionHandle, error) { - return nopRevertableSectionHandle{}, nil +func (OpenDoorSecurityPolicyEnforcer) WithTransaction(fn func() error) error { + return fn() } type ClosedDoorSecurityPolicyEnforcer struct{} @@ -471,6 +458,6 @@ func (ClosedDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context return errors.New("registry changes are denied by policy") } -func (*ClosedDoorSecurityPolicyEnforcer) StartRevertableSection() (RevertableSectionHandle, error) { - return nopRevertableSectionHandle{}, nil +func (ClosedDoorSecurityPolicyEnforcer) WithTransaction(fn func() error) error { + return fn() } diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index 3aa8c46fc0..a665f06450 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -60,10 +60,8 @@ type regoEnforcer struct { maxErrorMessageLength int // OS type osType string - // Mutex to ensure only one revertable section is active - revertableSectionLock sync.Mutex - // Saved metadata for the revertable section - savedMetadata rpi.SavedMetadata + // Mutex to ensure only one transaction is active + transactionLock sync.Mutex } var _ SecurityPolicyEnforcer = (*regoEnforcer)(nil) @@ -1198,67 +1196,29 @@ func (policy *regoEnforcer) GetUserInfo(process *oci.Process, rootPath string) ( return GetAllUserInfo(process, rootPath) } -type revertableSectionHandle struct { - // policy is cleared once this struct is "used", to prevent accidental - // duplicate Commit/Rollback calls. - policy *regoEnforcer -} - -// Starts a revertable section by saving the current policy state. If another -// revertable section is already active, this will wait until that one is -// finished. -func (policy *regoEnforcer) StartRevertableSection() (RevertableSectionHandle, error) { - policy.revertableSectionLock.Lock() - var err error - policy.savedMetadata, err = policy.rego.SaveMetadata() - if err != nil { - err = errors.Wrapf(err, "unable to save metadata for revertable section") - policy.revertableSectionLock.Unlock() - return nil, err - } - // Keep policy.revertableSectionLock locked until the end of the section. - sh := &revertableSectionHandle{ - policy: policy, - } - return sh, nil -} - -func (sh *revertableSectionHandle) Commit() { - if sh.policy == nil { - gcs.UnrecoverableError(errors.New("revertable section handle already used")) +// WithTransaction snapshots metadata, runs fn and rolls metadata back if fn +// returns an error. Nested or concurrent transactions are rejected. Returns +// the error from fn if it fails. +func (policy *regoEnforcer) WithTransaction(fn func() error) error { + if !policy.transactionLock.TryLock() { + return errors.New("nested or concurrent policy transactions are not supported") } + defer policy.transactionLock.Unlock() - policy := sh.policy - sh.policy = nil - lockSucc := policy.revertableSectionLock.TryLock() - if lockSucc { - gcs.UnrecoverableError(errors.New("not in a revertable section")) - } else { - // somebody else (i.e. the caller) has the lock, so we're in a revertable - // section. Clear the saved metadata just in case, then unlock to exit the - // section. - policy.savedMetadata = rpi.SavedMetadata{} - policy.revertableSectionLock.Unlock() - } -} - -func (sh *revertableSectionHandle) Rollback() { - if sh.policy == nil { - gcs.UnrecoverableError(errors.New("revertable section handle already used")) + saved, err := policy.rego.SaveMetadata() + if err != nil { + return errors.Wrap(err, "failed to snapshot policy metadata") } - policy := sh.policy - sh.policy = nil - lockSucc := policy.revertableSectionLock.TryLock() - if lockSucc { - gcs.UnrecoverableError(errors.New("not in a revertable section")) - } else { - // somebody else (i.e. the caller) has the lock, so we're in a revertable - // section. Restore the saved metadata, then unlock to exit the section. - err := policy.rego.RestoreMetadata(policy.savedMetadata) - if err != nil { - gcs.UnrecoverableError(errors.Wrap(err, "unable to restore metadata for revertable section")) + err = fn() + if err != nil { + if restoreErr := policy.rego.RestoreMetadata(saved); restoreErr != nil { + gcs.UnrecoverableError(errors.Wrapf(restoreErr, + "failed to rollback policy metadata after error: %v", err)) } - policy.revertableSectionLock.Unlock() + log.G(context.Background()).WithError(err).Warn("rolled back policy metadata due to error") + return err } + + return nil } From 198169743fa8bcb4e288ec1a28c6db77b968abb9 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Wed, 10 Jun 2026 01:39:50 +0000 Subject: [PATCH 07/12] uvmConsistencyError: change cause from string to error Suggested-by: Hamza El-Saawy Assisted-by: GitHub Copilot:claude-opus-4.8 Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index bce8e7aebe..e9c2b943b8 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -128,17 +128,17 @@ type Host struct { type uvmConsistencyError struct { mu sync.Mutex - // A user-friendly error message for why the UVM has entered an inconsistent - // state. If this is empty, there is no error and Check() returns nil. - cause string + // The error describing why the UVM has entered an inconsistent state. If + // this is nil, there is no error and Check() returns nil. + cause error } // Mark that the UVM has entered an inconsistent state, and store the cause if // it is not already set. -func (u *uvmConsistencyError) Set(cause string) { +func (u *uvmConsistencyError) Set(cause error) { u.mu.Lock() defer u.mu.Unlock() - if u.cause == "" { + if u.cause == nil { u.cause = cause } } @@ -147,13 +147,13 @@ func (u *uvmConsistencyError) Set(cause string) { func (u *uvmConsistencyError) Check() error { u.mu.Lock() defer u.mu.Unlock() - if u.cause != "" { - return errors.Errorf( - "Mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error (%q)", - u.cause, - ) + if u.cause == nil { + return nil } - return nil + return fmt.Errorf( + "Mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error: %w", + u.cause, + ) } func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer securitypolicy.SecurityPolicyEnforcer, logWriter io.Writer) *Host { @@ -437,7 +437,7 @@ func (h *Host) checkState() error { // setUVMInconsistent records that the UVM has entered an inconsistent state and // logs the cause. The flag is only consulted in confidential mode (see // checkState), so this is a no-op for non-confidential hosts. -func (h *Host) setUVMInconsistent(cause string) { +func (h *Host) setUVMInconsistent(cause error) { if !h.HasSecurityPolicy() { return } @@ -1375,7 +1375,7 @@ func (h *Host) modifyMappedVirtualDisk( err = scsi.Unmount(ctx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, config) if err != nil { h.setUVMInconsistent( - fmt.Sprintf("unmounting scsi device at %s failed: %v", mvd.MountPath, err), + fmt.Errorf("unmounting scsi device at %s failed: %w", mvd.MountPath, err), ) return err } @@ -1420,7 +1420,7 @@ func (h *Host) modifyMappedDirectory( err = storage.UnmountPath(ctx, md.MountPath, true) if err != nil { h.setUVMInconsistent( - fmt.Sprintf("unmounting plan9 device at %s failed: %v", md.MountPath, err), + fmt.Errorf("unmounting plan9 device at %s failed: %w", md.MountPath, err), ) return err } @@ -1489,7 +1489,7 @@ func (h *Host) modifyMappedVPMemDevice(ctx context.Context, err = pmem.Unmount(ctx, vpd.DeviceNumber, vpd.MountPath, vpd.MappingInfo, verityInfo) if err != nil { h.setUVMInconsistent( - fmt.Sprintf("unmounting pmem device at %s failed: %v", vpd.MountPath, err), + fmt.Errorf("unmounting pmem device at %s failed: %w", vpd.MountPath, err), ) return err } @@ -1596,7 +1596,7 @@ func (h *Host) modifyCombinedLayers( err = storage.UnmountPath(ctx, cl.ContainerRootPath, true) if err != nil { h.setUVMInconsistent( - fmt.Sprintf("unmounting overlay at %s failed: %v", cl.ContainerRootPath, err), + fmt.Errorf("unmounting overlay at %s failed: %w", cl.ContainerRootPath, err), ) return err } From eefab95050d0b5e6bfcdc36ab695aa0e84bf395e Mon Sep 17 00:00:00 2001 From: Maksim An Date: Wed, 10 Jun 2026 00:00:08 -0700 Subject: [PATCH 08/12] review: refactor metadata rollback and error handling - Rename WithTransaction to WithMetadataRollback for clarity - Move unrecoverable error handling from policy layer to GCS layer to fix inverted dependency (pkg/securitypolicy -> internal/gcs) - Replace UnrecoverableError with two-level UVM error state: - Inconsistent (unmount failure): blocks mount/unmount/container create/delete and policy injection, allows shutdown/signal/exec and diagnostics for cleanup and troubleshooting - Unrecoverable (policy metadata rollback failure): blocks all operations except exec and diagnostics, GCS self-terminates via panic after 1 hour grace period - Remove unused internal/gcs/unrecoverable_error.go - Add withPolicyRollback helper on Host to centralize sentinel detection and UVM state management Assisted-by: GitHub Copilot claude-opus-4.8 Signed-off-by: Maksim An --- internal/gcs/unrecoverable_error.go | 53 ----- internal/guest/runtime/hcsv2/uvm.go | 187 +++++++++++++----- pkg/securitypolicy/regopolicy_linux_test.go | 36 ++-- pkg/securitypolicy/securitypolicyenforcer.go | 11 +- .../securitypolicyenforcer_rego.go | 10 +- 5 files changed, 169 insertions(+), 128 deletions(-) delete mode 100644 internal/gcs/unrecoverable_error.go diff --git a/internal/gcs/unrecoverable_error.go b/internal/gcs/unrecoverable_error.go deleted file mode 100644 index 96528088aa..0000000000 --- a/internal/gcs/unrecoverable_error.go +++ /dev/null @@ -1,53 +0,0 @@ -package gcs - -import ( - "context" - "fmt" - "os" - "runtime" - "time" - - "github.com/Microsoft/hcsshim/internal/log" - "github.com/Microsoft/hcsshim/pkg/amdsevsnp" - "github.com/sirupsen/logrus" -) - -// UnrecoverableError logs the error and then puts the current thread into an -// infinite sleep loop. This is to be used instead of panicking, as the -// behaviour of GCS panics is unpredictable. This function can be extended to, -// for example, try to shutdown the VM cleanly. -func UnrecoverableError(err error) { - buf := make([]byte, 300*(1<<10)) - stackSize := runtime.Stack(buf, true) - stackTrace := string(buf[:stackSize]) - - errPrint := fmt.Sprintf( - "Unrecoverable error in GCS: %v\n%s", - err, stackTrace, - ) - - isSnp, err := amdsevsnp.IsSNP() - if err != nil { - // IsSNP() cannot fail on LCOW - // but if it does, we proceed as if we're on SNP to be safe. - isSnp = true - } - - if isSnp { - errPrint += "\nThis thread will now enter an infinite loop." - } - log.G(context.Background()).WithError(err).Logf( - logrus.FatalLevel, - "%s", - errPrint, - ) - - if !isSnp { - panic("Unrecoverable error in GCS: " + err.Error()) - } else { - fmt.Fprintf(os.Stderr, "%s\n", errPrint) - for { - time.Sleep(time.Hour) - } - } -} diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index e9c2b943b8..09cef0d723 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -116,44 +116,83 @@ type Host struct { // hostMounts keeps the state of currently mounted devices and file systems, // which is used for GCS hardening. hostMounts *hostMounts - // uvmError contains a permanent flag to indicate that further mounts, - // unmounts and container creation / deletion should not be allowed. This - // is set when, because of a failure during an unmount operation, we end up - // in a state where the policy enforcer's state is out of sync with what we - // have actually done, but we cannot safely revert its state. + // uvmError tracks two levels of error severity for confidential containers: // - // Only consulted in confidential mode (see Host.checkState). - uvmError uvmConsistencyError + // Inconsistent: set when an unmount operation fails and we cannot safely + // revert the policy enforcer's metadata state. Blocks mount, unmount, + // container creation and deletion, but allows shutdown, signal, exec and + // diagnostics so the host can clean up and operators can troubleshoot. + // + // Unrecoverable: set when the policy metadata rollback itself fails + // (ErrFatalPolicyDesync). The policy state is corrupted beyond repair. + // Blocks all operations except exec and diagnostics. The GCS process + // will self-terminate after a grace period. + // + // Only consulted in confidential mode (see Host.checkInconsistent, + // Host.checkUnrecoverable). + uvmError uvmErrorState } -type uvmConsistencyError struct { - mu sync.Mutex - // The error describing why the UVM has entered an inconsistent state. If - // this is nil, there is no error and Check() returns nil. - cause error +// uvmErrorState tracks two severity levels for UVM errors in confidential mode. +type uvmErrorState struct { + mu sync.Mutex + inconsistent error // unmount failure: policy metadata out of sync + unrecoverable error // policy rollback failure: metadata corrupted } -// Mark that the UVM has entered an inconsistent state, and store the cause if -// it is not already set. -func (u *uvmConsistencyError) Set(cause error) { +// SetInconsistent marks the UVM as having inconsistent policy metadata. +// Stores the cause only if not already set. +func (u *uvmErrorState) SetInconsistent(cause error) { u.mu.Lock() defer u.mu.Unlock() - if u.cause == nil { - u.cause = cause + if u.inconsistent == nil { + u.inconsistent = cause } } -// Check returns a non-nil error if the UVM has been marked inconsistent. -func (u *uvmConsistencyError) Check() error { +// SetUnrecoverable marks the UVM as having corrupted policy metadata. +// Stores the cause only if not already set. +func (u *uvmErrorState) SetUnrecoverable(cause error) { u.mu.Lock() defer u.mu.Unlock() - if u.cause == nil { - return nil + if u.unrecoverable == nil { + u.unrecoverable = cause } - return fmt.Errorf( - "Mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error: %w", - u.cause, - ) +} + +// CheckInconsistent returns a non-nil error if the UVM is inconsistent OR +// unrecoverable. Used to gate mount/unmount/container create/delete. +func (u *uvmErrorState) CheckInconsistent() error { + u.mu.Lock() + defer u.mu.Unlock() + if u.unrecoverable != nil { + return fmt.Errorf( + "all operations have been disabled in this UVM due to an unrecoverable error: %w", + u.unrecoverable, + ) + } + if u.inconsistent != nil { + return fmt.Errorf( + "mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error: %w", + u.inconsistent, + ) + } + return nil +} + +// CheckUnrecoverable returns a non-nil error only if the UVM is in the +// unrecoverable state. Used to gate operations like shutdown and signal +// that should still work when merely inconsistent. +func (u *uvmErrorState) CheckUnrecoverable() error { + u.mu.Lock() + defer u.mu.Unlock() + if u.unrecoverable != nil { + return fmt.Errorf( + "all operations have been disabled in this UVM due to an unrecoverable error: %w", + u.unrecoverable, + ) + } + return nil } func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer securitypolicy.SecurityPolicyEnforcer, logWriter io.Writer) *Host { @@ -173,7 +212,6 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s devNullTransport: &transport.DevNullTransport{}, hostMounts: newHostMounts(), securityOptions: securityPolicyOptions, - uvmError: uvmConsistencyError{}, } } @@ -425,26 +463,71 @@ func checkContainerSettings(sandboxID, containerID string, settings *prot.VMHost return nil } -// checkState returns an error if the UVM has entered an inconsistent state from -// which it cannot safely recover. Only enforced for confidential containers. -func (h *Host) checkState() error { +// checkInconsistent returns an error if the UVM is inconsistent or +// unrecoverable. Used to gate mount/unmount/container create/delete. +// Only enforced for confidential containers. +func (h *Host) checkInconsistent() error { + if h.HasSecurityPolicy() { + return h.uvmError.CheckInconsistent() + } + return nil +} + +// checkUnrecoverable returns an error only if the UVM is in the unrecoverable +// state. Used to gate shutdown/signal - operations that should still work +// when the UVM is merely inconsistent. Only enforced for confidential containers. +func (h *Host) checkUnrecoverable() error { if h.HasSecurityPolicy() { - return h.uvmError.Check() + return h.uvmError.CheckUnrecoverable() } return nil } -// setUVMInconsistent records that the UVM has entered an inconsistent state and -// logs the cause. The flag is only consulted in confidential mode (see -// checkState), so this is a no-op for non-confidential hosts. +// setUVMInconsistent records that the UVM has inconsistent policy metadata due +// to an unmount failure. Blocks mount/unmount/container operations. +// No-op for non-confidential hosts. func (h *Host) setUVMInconsistent(cause error) { if !h.HasSecurityPolicy() { return } - h.uvmError.Set(cause) + h.uvmError.SetInconsistent(cause) log.G(context.Background()).WithFields(logrus.Fields{ "cause": cause, - }).Error("Host marked inconsistent. All further mounts/unmounts, container creation and deletion will fail.") + }).Error("Host marked inconsistent. Mount/unmount and container create/delete will fail.") +} + +// gracePeriodBeforeExit is how long the GCS stays alive after an unrecoverable +// error to allow diagnostics before self-terminating. +const gracePeriodBeforeExit = 1 * time.Hour + +// setUVMUnrecoverable records that the UVM has corrupted policy metadata. +// Blocks all operations. Spawns a goroutine to self-terminate the GCS after +// a grace period. No-op for non-confidential hosts. +func (h *Host) setUVMUnrecoverable(cause error) { + if !h.HasSecurityPolicy() { + return + } + h.uvmError.SetUnrecoverable(cause) + log.G(context.Background()).WithFields(logrus.Fields{ + "cause": cause, + "gracePeriod": gracePeriodBeforeExit, + }).Error("Host marked unrecoverable. All operations will fail. GCS will self-terminate after grace period.") + + go func() { + time.Sleep(gracePeriodBeforeExit) + panic(fmt.Sprintf("GCS terminating after grace period due to unrecoverable error: %v", cause)) + }() +} + +// withPolicyRollback runs fn inside WithMetadataRollback on the policy enforcer. +// If it returns ErrFatalPolicyDesync the UVM is marked unrecoverable; all +// operations will be blocked and the GCS will self-terminate after a grace period. +func (h *Host) withPolicyRollback(fn func() error) error { + err := h.securityOptions.PolicyEnforcer.WithMetadataRollback(fn) + if errors.Is(err, securitypolicy.ErrFatalPolicyDesync) { + h.setUVMUnrecoverable(err) + } + return err } func checkExists(path string) (bool, error) { @@ -458,7 +541,7 @@ func checkExists(path string) (bool, error) { } func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { - if err = h.checkState(); err != nil { + if err = h.checkInconsistent(); err != nil { return nil, err } @@ -813,10 +896,9 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * case guestresource.ResourceTypeSCSIDevice: return modifySCSIDevice(ctx, req.RequestType, req.Settings.(*guestresource.SCSIDevice)) case guestresource.ResourceTypeMappedVirtualDisk: - if err := h.checkState(); err != nil { + if err := h.checkInconsistent(); err != nil { return err } - mvd := req.Settings.(*guestresource.LCOWMappedVirtualDisk) // find the actual controller number on the bus and update the incoming request. var cNum uint8 @@ -856,22 +938,19 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * } return h.modifyMappedVirtualDisk(ctx, req.RequestType, mvd) case guestresource.ResourceTypeMappedDirectory: - if err := h.checkState(); err != nil { + if err := h.checkInconsistent(); err != nil { return err } - return h.modifyMappedDirectory(ctx, h.vsock, req.RequestType, req.Settings.(*guestresource.LCOWMappedDirectory)) case guestresource.ResourceTypeVPMemDevice: - if err := h.checkState(); err != nil { + if err := h.checkInconsistent(); err != nil { return err } - return h.modifyMappedVPMemDevice(ctx, req.RequestType, req.Settings.(*guestresource.LCOWMappedVPMemDevice)) case guestresource.ResourceTypeCombinedLayers: - if err := h.checkState(); err != nil { + if err := h.checkInconsistent(); err != nil { return err } - cl := req.Settings.(*guestresource.LCOWCombinedLayers) // when cl.ScratchPath == "", we mount overlay as read-only, in which case // we don't really care about scratch encryption, since the host already @@ -889,6 +968,9 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * } return c.modifyContainerConstraints(ctx, req.RequestType, req.Settings.(*guestresource.LCOWContainerConstraints)) case guestresource.ResourceTypeSecurityPolicy: + if err := h.checkInconsistent(); err != nil { + return err + } r, ok := req.Settings.(*guestresource.ConfidentialOptions) if !ok { return errors.New("the request's settings are not of type ConfidentialOptions") @@ -898,6 +980,9 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * r.EncodedSecurityPolicy, r.EncodedUVMReference) case guestresource.ResourceTypePolicyFragment: + if err := h.checkInconsistent(); err != nil { + return err + } r, ok := req.Settings.(*guestresource.SecurityPolicyFragment) if !ok { return errors.New("the request settings are not of type SecurityPolicyFragment") @@ -943,6 +1028,9 @@ func (*Host) Shutdown() { // Called to shutdown a container func (h *Host) ShutdownContainer(ctx context.Context, containerID string, graceful bool) error { + if err := h.checkUnrecoverable(); err != nil { + return err + } c, err := h.GetCreatedContainer(containerID) if err != nil { return err @@ -962,6 +1050,9 @@ func (h *Host) ShutdownContainer(ctx context.Context, containerID string, gracef } func (h *Host) SignalContainerProcess(ctx context.Context, containerID string, processID uint32, signal syscall.Signal) error { + if err := h.checkUnrecoverable(); err != nil { + return err + } c, err := h.GetCreatedContainer(containerID) if err != nil { return err @@ -1296,7 +1387,7 @@ func (h *Host) modifyMappedVirtualDisk( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations by marking the UVM state as inconsistent. - return securityPolicy.WithTransaction(func() error { + return h.withPolicyRollback(func() error { switch rt { case guestrequest.RequestTypeAdd: mountCtx, cancel := context.WithTimeout(ctx, time.Second*5) @@ -1398,7 +1489,7 @@ func (h *Host) modifyMappedDirectory( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return securityPolicy.WithTransaction(func() error { + return h.withPolicyRollback(func() error { switch rt { case guestrequest.RequestTypeAdd: err = securityPolicy.EnforcePlan9MountPolicy(ctx, md.MountPath) @@ -1453,7 +1544,7 @@ func (h *Host) modifyMappedVPMemDevice(ctx context.Context, // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return securityPolicy.WithTransaction(func() error { + return h.withPolicyRollback(func() error { switch rt { case guestrequest.RequestTypeAdd: err = securityPolicy.EnforceDeviceMountPolicy(ctx, vpd.MountPath, deviceHash) @@ -1522,7 +1613,7 @@ func (h *Host) modifyCombinedLayers( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return securityPolicy.WithTransaction(func() error { + return h.withPolicyRollback(func() error { switch rt { case guestrequest.RequestTypeAdd: if h.HasSecurityPolicy() { diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 3bf07f75ad..236e64d4bd 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -978,7 +978,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { errSimulatedFailure := errors.New("simulated failure") - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchTarget, true, true, "xfs") }) if err != nil { @@ -993,7 +993,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { target := testDataGenerator.uniqueLayerMountTarget() layerPaths[len(tc.Layers)-i-1] = target var policyErr error - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) if policyErr != nil { return policyErr @@ -1012,7 +1012,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { overlayTarget := getOverlayMountTarget(tid) var policyErr error - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPaths, overlayTarget) if commitOnEnforcementFailure { return nil @@ -1020,7 +1020,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { return policyErr }) if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) return false } if !assertDecisionJSONContains(t, policyErr, append(slices.Clone(layerPaths), "no matching containers for overlay")...) { @@ -1034,7 +1034,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { } } - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPathsWithoutErr, overlayTarget) if commitOnEnforcementFailure { return nil @@ -1042,7 +1042,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { return policyErr }) if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) return false } if !assertDecisionJSONContains(t, policyErr, append(slices.Clone(layerPathsWithoutErr), "no matching containers for overlay")...) { @@ -1050,7 +1050,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { } retryTarget := layerPaths[errLayerPathIndex] - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceDeviceMountPolicy(gc.ctx, retryTarget, tc.Layers[layerToErr]) }) if err != nil { @@ -6241,7 +6241,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { errSimulatedFailure := errors.New("simulated failure") scratchMountTarget := getScratchDiskMountTarget(containerID) - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") }) if err != nil { @@ -6251,7 +6251,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { overlayTarget := getOverlayMountTarget(containerID) var policyErr error - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, containerID, layers, overlayTarget) if policyErr != nil { return policyErr @@ -6264,7 +6264,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { _, _, _, policyErr = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) if commitOnEnforcementFailure { return nil @@ -6276,12 +6276,12 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) return false } // "Retry" overlay mount - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceOverlayMountPolicy(gc.ctx, tc.containerID, layers, overlayTarget) }) if err != nil { @@ -6289,7 +6289,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) return err }) @@ -6333,7 +6333,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { scratchMountTarget := getScratchDiskMountTarget(containerID) var policyErr error - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") if policyErr != nil { return policyErr @@ -6356,7 +6356,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { layerToErr := testRand.Intn(len(container.Layers)) for i, layerHash := range container.Layers { target := testDataGenerator.uniqueLayerMountTarget() - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) if policyErr != nil { return policyErr @@ -6380,7 +6380,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } for _, layerPath := range succeedLayerPaths { - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceDeviceUnmountPolicy(gc.ctx, layerPath) }) if err != nil { @@ -6389,7 +6389,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } } - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceRWDeviceUnmountPolicy(gc.ctx, scratchMountTarget) }) if err != nil { @@ -6399,7 +6399,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } if testDenyInvalidContainerCreation { - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { _, _, _, policyErr = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) return policyErr }) diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index bd48373cbc..0a497d0db9 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -57,6 +57,11 @@ func init() { registeredEnforcers[openDoorEnforcerName] = createOpenDoorEnforcer } +// ErrFatalPolicyDesync indicates that policy metadata could not be restored +// after a failed operation, leaving the enforcer in an unknown state. Callers +// should treat this as unrecoverable and block all further operations. +var ErrFatalPolicyDesync = errors.New("fatal policy metadata desync") + type SecurityPolicyEnforcer interface { EnforceDeviceMountPolicy(ctx context.Context, target string, deviceHash string) (err error) EnforceRWDeviceMountPolicy(ctx context.Context, target string, encrypted, ensureFilesystem bool, filesystem string) (err error) @@ -128,7 +133,7 @@ type SecurityPolicyEnforcer interface { GetUserInfo(spec *oci.Process, rootPath string) (IDName, []IDName, string, error) EnforceVerifiedCIMsPolicy(ctx context.Context, containerID string, layerHashes []string, mountedCim []string) (err error) EnforceRegistryChangesPolicy(ctx context.Context, containerID string, registryValues interface{}) error - WithTransaction(fn func() error) error + WithMetadataRollback(fn func() error) error } //nolint:unused @@ -325,7 +330,7 @@ func (OpenDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context.C return nil } -func (OpenDoorSecurityPolicyEnforcer) WithTransaction(fn func() error) error { +func (OpenDoorSecurityPolicyEnforcer) WithMetadataRollback(fn func() error) error { return fn() } @@ -458,6 +463,6 @@ func (ClosedDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context return errors.New("registry changes are denied by policy") } -func (ClosedDoorSecurityPolicyEnforcer) WithTransaction(fn func() error) error { +func (ClosedDoorSecurityPolicyEnforcer) WithMetadataRollback(fn func() error) error { return fn() } diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index a665f06450..f37c73b695 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -15,7 +15,6 @@ import ( "sync" "syscall" - "github.com/Microsoft/hcsshim/internal/gcs" "github.com/Microsoft/hcsshim/internal/guestpath" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" @@ -1196,10 +1195,10 @@ func (policy *regoEnforcer) GetUserInfo(process *oci.Process, rootPath string) ( return GetAllUserInfo(process, rootPath) } -// WithTransaction snapshots metadata, runs fn and rolls metadata back if fn -// returns an error. Nested or concurrent transactions are rejected. Returns +// WithMetadataRollback snapshots metadata, runs fn and rolls metadata back if fn +// returns an error. Nested or concurrent calls are rejected. Returns // the error from fn if it fails. -func (policy *regoEnforcer) WithTransaction(fn func() error) error { +func (policy *regoEnforcer) WithMetadataRollback(fn func() error) error { if !policy.transactionLock.TryLock() { return errors.New("nested or concurrent policy transactions are not supported") } @@ -1213,8 +1212,7 @@ func (policy *regoEnforcer) WithTransaction(fn func() error) error { err = fn() if err != nil { if restoreErr := policy.rego.RestoreMetadata(saved); restoreErr != nil { - gcs.UnrecoverableError(errors.Wrapf(restoreErr, - "failed to rollback policy metadata after error: %v", err)) + return fmt.Errorf("%w: restore failed (%v) after: %v", ErrFatalPolicyDesync, restoreErr, err) } log.G(context.Background()).WithError(err).Warn("rolled back policy metadata due to error") return err From 9dc22cfad02e14cc5c1592156420cd67f1c43faf Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Wed, 10 Jun 2026 22:46:32 +0000 Subject: [PATCH 09/12] Revert "review: refactor metadata rollback and error handling" After discussion with Maksim, we decided to keep the `for { sleep } ` loop approach for now until the gcs-sidecar can be made a critical process and we're confident that taking GCS down will take down the UVM, rather than e.g. restart the GCS. This reverts commit eefab95050d0b5e6bfcdc36ab695aa0e84bf395e. Signed-off-by: Tingmao Wang --- internal/gcs/unrecoverable_error.go | 53 +++++ internal/guest/runtime/hcsv2/uvm.go | 187 +++++------------- pkg/securitypolicy/regopolicy_linux_test.go | 36 ++-- pkg/securitypolicy/securitypolicyenforcer.go | 11 +- .../securitypolicyenforcer_rego.go | 10 +- 5 files changed, 128 insertions(+), 169 deletions(-) create mode 100644 internal/gcs/unrecoverable_error.go diff --git a/internal/gcs/unrecoverable_error.go b/internal/gcs/unrecoverable_error.go new file mode 100644 index 0000000000..96528088aa --- /dev/null +++ b/internal/gcs/unrecoverable_error.go @@ -0,0 +1,53 @@ +package gcs + +import ( + "context" + "fmt" + "os" + "runtime" + "time" + + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/pkg/amdsevsnp" + "github.com/sirupsen/logrus" +) + +// UnrecoverableError logs the error and then puts the current thread into an +// infinite sleep loop. This is to be used instead of panicking, as the +// behaviour of GCS panics is unpredictable. This function can be extended to, +// for example, try to shutdown the VM cleanly. +func UnrecoverableError(err error) { + buf := make([]byte, 300*(1<<10)) + stackSize := runtime.Stack(buf, true) + stackTrace := string(buf[:stackSize]) + + errPrint := fmt.Sprintf( + "Unrecoverable error in GCS: %v\n%s", + err, stackTrace, + ) + + isSnp, err := amdsevsnp.IsSNP() + if err != nil { + // IsSNP() cannot fail on LCOW + // but if it does, we proceed as if we're on SNP to be safe. + isSnp = true + } + + if isSnp { + errPrint += "\nThis thread will now enter an infinite loop." + } + log.G(context.Background()).WithError(err).Logf( + logrus.FatalLevel, + "%s", + errPrint, + ) + + if !isSnp { + panic("Unrecoverable error in GCS: " + err.Error()) + } else { + fmt.Fprintf(os.Stderr, "%s\n", errPrint) + for { + time.Sleep(time.Hour) + } + } +} diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 09cef0d723..e9c2b943b8 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -116,83 +116,44 @@ type Host struct { // hostMounts keeps the state of currently mounted devices and file systems, // which is used for GCS hardening. hostMounts *hostMounts - // uvmError tracks two levels of error severity for confidential containers: + // uvmError contains a permanent flag to indicate that further mounts, + // unmounts and container creation / deletion should not be allowed. This + // is set when, because of a failure during an unmount operation, we end up + // in a state where the policy enforcer's state is out of sync with what we + // have actually done, but we cannot safely revert its state. // - // Inconsistent: set when an unmount operation fails and we cannot safely - // revert the policy enforcer's metadata state. Blocks mount, unmount, - // container creation and deletion, but allows shutdown, signal, exec and - // diagnostics so the host can clean up and operators can troubleshoot. - // - // Unrecoverable: set when the policy metadata rollback itself fails - // (ErrFatalPolicyDesync). The policy state is corrupted beyond repair. - // Blocks all operations except exec and diagnostics. The GCS process - // will self-terminate after a grace period. - // - // Only consulted in confidential mode (see Host.checkInconsistent, - // Host.checkUnrecoverable). - uvmError uvmErrorState -} - -// uvmErrorState tracks two severity levels for UVM errors in confidential mode. -type uvmErrorState struct { - mu sync.Mutex - inconsistent error // unmount failure: policy metadata out of sync - unrecoverable error // policy rollback failure: metadata corrupted + // Only consulted in confidential mode (see Host.checkState). + uvmError uvmConsistencyError } -// SetInconsistent marks the UVM as having inconsistent policy metadata. -// Stores the cause only if not already set. -func (u *uvmErrorState) SetInconsistent(cause error) { - u.mu.Lock() - defer u.mu.Unlock() - if u.inconsistent == nil { - u.inconsistent = cause - } +type uvmConsistencyError struct { + mu sync.Mutex + // The error describing why the UVM has entered an inconsistent state. If + // this is nil, there is no error and Check() returns nil. + cause error } -// SetUnrecoverable marks the UVM as having corrupted policy metadata. -// Stores the cause only if not already set. -func (u *uvmErrorState) SetUnrecoverable(cause error) { +// Mark that the UVM has entered an inconsistent state, and store the cause if +// it is not already set. +func (u *uvmConsistencyError) Set(cause error) { u.mu.Lock() defer u.mu.Unlock() - if u.unrecoverable == nil { - u.unrecoverable = cause + if u.cause == nil { + u.cause = cause } } -// CheckInconsistent returns a non-nil error if the UVM is inconsistent OR -// unrecoverable. Used to gate mount/unmount/container create/delete. -func (u *uvmErrorState) CheckInconsistent() error { +// Check returns a non-nil error if the UVM has been marked inconsistent. +func (u *uvmConsistencyError) Check() error { u.mu.Lock() defer u.mu.Unlock() - if u.unrecoverable != nil { - return fmt.Errorf( - "all operations have been disabled in this UVM due to an unrecoverable error: %w", - u.unrecoverable, - ) - } - if u.inconsistent != nil { - return fmt.Errorf( - "mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error: %w", - u.inconsistent, - ) - } - return nil -} - -// CheckUnrecoverable returns a non-nil error only if the UVM is in the -// unrecoverable state. Used to gate operations like shutdown and signal -// that should still work when merely inconsistent. -func (u *uvmErrorState) CheckUnrecoverable() error { - u.mu.Lock() - defer u.mu.Unlock() - if u.unrecoverable != nil { - return fmt.Errorf( - "all operations have been disabled in this UVM due to an unrecoverable error: %w", - u.unrecoverable, - ) + if u.cause == nil { + return nil } - return nil + return fmt.Errorf( + "Mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error: %w", + u.cause, + ) } func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer securitypolicy.SecurityPolicyEnforcer, logWriter io.Writer) *Host { @@ -212,6 +173,7 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s devNullTransport: &transport.DevNullTransport{}, hostMounts: newHostMounts(), securityOptions: securityPolicyOptions, + uvmError: uvmConsistencyError{}, } } @@ -463,71 +425,26 @@ func checkContainerSettings(sandboxID, containerID string, settings *prot.VMHost return nil } -// checkInconsistent returns an error if the UVM is inconsistent or -// unrecoverable. Used to gate mount/unmount/container create/delete. -// Only enforced for confidential containers. -func (h *Host) checkInconsistent() error { - if h.HasSecurityPolicy() { - return h.uvmError.CheckInconsistent() - } - return nil -} - -// checkUnrecoverable returns an error only if the UVM is in the unrecoverable -// state. Used to gate shutdown/signal - operations that should still work -// when the UVM is merely inconsistent. Only enforced for confidential containers. -func (h *Host) checkUnrecoverable() error { +// checkState returns an error if the UVM has entered an inconsistent state from +// which it cannot safely recover. Only enforced for confidential containers. +func (h *Host) checkState() error { if h.HasSecurityPolicy() { - return h.uvmError.CheckUnrecoverable() + return h.uvmError.Check() } return nil } -// setUVMInconsistent records that the UVM has inconsistent policy metadata due -// to an unmount failure. Blocks mount/unmount/container operations. -// No-op for non-confidential hosts. +// setUVMInconsistent records that the UVM has entered an inconsistent state and +// logs the cause. The flag is only consulted in confidential mode (see +// checkState), so this is a no-op for non-confidential hosts. func (h *Host) setUVMInconsistent(cause error) { if !h.HasSecurityPolicy() { return } - h.uvmError.SetInconsistent(cause) + h.uvmError.Set(cause) log.G(context.Background()).WithFields(logrus.Fields{ "cause": cause, - }).Error("Host marked inconsistent. Mount/unmount and container create/delete will fail.") -} - -// gracePeriodBeforeExit is how long the GCS stays alive after an unrecoverable -// error to allow diagnostics before self-terminating. -const gracePeriodBeforeExit = 1 * time.Hour - -// setUVMUnrecoverable records that the UVM has corrupted policy metadata. -// Blocks all operations. Spawns a goroutine to self-terminate the GCS after -// a grace period. No-op for non-confidential hosts. -func (h *Host) setUVMUnrecoverable(cause error) { - if !h.HasSecurityPolicy() { - return - } - h.uvmError.SetUnrecoverable(cause) - log.G(context.Background()).WithFields(logrus.Fields{ - "cause": cause, - "gracePeriod": gracePeriodBeforeExit, - }).Error("Host marked unrecoverable. All operations will fail. GCS will self-terminate after grace period.") - - go func() { - time.Sleep(gracePeriodBeforeExit) - panic(fmt.Sprintf("GCS terminating after grace period due to unrecoverable error: %v", cause)) - }() -} - -// withPolicyRollback runs fn inside WithMetadataRollback on the policy enforcer. -// If it returns ErrFatalPolicyDesync the UVM is marked unrecoverable; all -// operations will be blocked and the GCS will self-terminate after a grace period. -func (h *Host) withPolicyRollback(fn func() error) error { - err := h.securityOptions.PolicyEnforcer.WithMetadataRollback(fn) - if errors.Is(err, securitypolicy.ErrFatalPolicyDesync) { - h.setUVMUnrecoverable(err) - } - return err + }).Error("Host marked inconsistent. All further mounts/unmounts, container creation and deletion will fail.") } func checkExists(path string) (bool, error) { @@ -541,7 +458,7 @@ func checkExists(path string) (bool, error) { } func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { - if err = h.checkInconsistent(); err != nil { + if err = h.checkState(); err != nil { return nil, err } @@ -896,9 +813,10 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * case guestresource.ResourceTypeSCSIDevice: return modifySCSIDevice(ctx, req.RequestType, req.Settings.(*guestresource.SCSIDevice)) case guestresource.ResourceTypeMappedVirtualDisk: - if err := h.checkInconsistent(); err != nil { + if err := h.checkState(); err != nil { return err } + mvd := req.Settings.(*guestresource.LCOWMappedVirtualDisk) // find the actual controller number on the bus and update the incoming request. var cNum uint8 @@ -938,19 +856,22 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * } return h.modifyMappedVirtualDisk(ctx, req.RequestType, mvd) case guestresource.ResourceTypeMappedDirectory: - if err := h.checkInconsistent(); err != nil { + if err := h.checkState(); err != nil { return err } + return h.modifyMappedDirectory(ctx, h.vsock, req.RequestType, req.Settings.(*guestresource.LCOWMappedDirectory)) case guestresource.ResourceTypeVPMemDevice: - if err := h.checkInconsistent(); err != nil { + if err := h.checkState(); err != nil { return err } + return h.modifyMappedVPMemDevice(ctx, req.RequestType, req.Settings.(*guestresource.LCOWMappedVPMemDevice)) case guestresource.ResourceTypeCombinedLayers: - if err := h.checkInconsistent(); err != nil { + if err := h.checkState(); err != nil { return err } + cl := req.Settings.(*guestresource.LCOWCombinedLayers) // when cl.ScratchPath == "", we mount overlay as read-only, in which case // we don't really care about scratch encryption, since the host already @@ -968,9 +889,6 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * } return c.modifyContainerConstraints(ctx, req.RequestType, req.Settings.(*guestresource.LCOWContainerConstraints)) case guestresource.ResourceTypeSecurityPolicy: - if err := h.checkInconsistent(); err != nil { - return err - } r, ok := req.Settings.(*guestresource.ConfidentialOptions) if !ok { return errors.New("the request's settings are not of type ConfidentialOptions") @@ -980,9 +898,6 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * r.EncodedSecurityPolicy, r.EncodedUVMReference) case guestresource.ResourceTypePolicyFragment: - if err := h.checkInconsistent(); err != nil { - return err - } r, ok := req.Settings.(*guestresource.SecurityPolicyFragment) if !ok { return errors.New("the request settings are not of type SecurityPolicyFragment") @@ -1028,9 +943,6 @@ func (*Host) Shutdown() { // Called to shutdown a container func (h *Host) ShutdownContainer(ctx context.Context, containerID string, graceful bool) error { - if err := h.checkUnrecoverable(); err != nil { - return err - } c, err := h.GetCreatedContainer(containerID) if err != nil { return err @@ -1050,9 +962,6 @@ func (h *Host) ShutdownContainer(ctx context.Context, containerID string, gracef } func (h *Host) SignalContainerProcess(ctx context.Context, containerID string, processID uint32, signal syscall.Signal) error { - if err := h.checkUnrecoverable(); err != nil { - return err - } c, err := h.GetCreatedContainer(containerID) if err != nil { return err @@ -1387,7 +1296,7 @@ func (h *Host) modifyMappedVirtualDisk( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations by marking the UVM state as inconsistent. - return h.withPolicyRollback(func() error { + return securityPolicy.WithTransaction(func() error { switch rt { case guestrequest.RequestTypeAdd: mountCtx, cancel := context.WithTimeout(ctx, time.Second*5) @@ -1489,7 +1398,7 @@ func (h *Host) modifyMappedDirectory( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return h.withPolicyRollback(func() error { + return securityPolicy.WithTransaction(func() error { switch rt { case guestrequest.RequestTypeAdd: err = securityPolicy.EnforcePlan9MountPolicy(ctx, md.MountPath) @@ -1544,7 +1453,7 @@ func (h *Host) modifyMappedVPMemDevice(ctx context.Context, // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return h.withPolicyRollback(func() error { + return securityPolicy.WithTransaction(func() error { switch rt { case guestrequest.RequestTypeAdd: err = securityPolicy.EnforceDeviceMountPolicy(ctx, vpd.MountPath, deviceHash) @@ -1613,7 +1522,7 @@ func (h *Host) modifyCombinedLayers( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return h.withPolicyRollback(func() error { + return securityPolicy.WithTransaction(func() error { switch rt { case guestrequest.RequestTypeAdd: if h.HasSecurityPolicy() { diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 236e64d4bd..3bf07f75ad 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -978,7 +978,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { errSimulatedFailure := errors.New("simulated failure") - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { return policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchTarget, true, true, "xfs") }) if err != nil { @@ -993,7 +993,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { target := testDataGenerator.uniqueLayerMountTarget() layerPaths[len(tc.Layers)-i-1] = target var policyErr error - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { policyErr = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) if policyErr != nil { return policyErr @@ -1012,7 +1012,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { overlayTarget := getOverlayMountTarget(tid) var policyErr error - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPaths, overlayTarget) if commitOnEnforcementFailure { return nil @@ -1020,7 +1020,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { return policyErr }) if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) + t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) return false } if !assertDecisionJSONContains(t, policyErr, append(slices.Clone(layerPaths), "no matching containers for overlay")...) { @@ -1034,7 +1034,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { } } - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPathsWithoutErr, overlayTarget) if commitOnEnforcementFailure { return nil @@ -1042,7 +1042,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { return policyErr }) if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) + t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) return false } if !assertDecisionJSONContains(t, policyErr, append(slices.Clone(layerPathsWithoutErr), "no matching containers for overlay")...) { @@ -1050,7 +1050,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { } retryTarget := layerPaths[errLayerPathIndex] - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { return policy.EnforceDeviceMountPolicy(gc.ctx, retryTarget, tc.Layers[layerToErr]) }) if err != nil { @@ -6241,7 +6241,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { errSimulatedFailure := errors.New("simulated failure") scratchMountTarget := getScratchDiskMountTarget(containerID) - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { return policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") }) if err != nil { @@ -6251,7 +6251,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { overlayTarget := getOverlayMountTarget(containerID) var policyErr error - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, containerID, layers, overlayTarget) if policyErr != nil { return policyErr @@ -6264,7 +6264,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { _, _, _, policyErr = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) if commitOnEnforcementFailure { return nil @@ -6276,12 +6276,12 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) + t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) return false } // "Retry" overlay mount - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { return policy.EnforceOverlayMountPolicy(gc.ctx, tc.containerID, layers, overlayTarget) }) if err != nil { @@ -6289,7 +6289,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) return err }) @@ -6333,7 +6333,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { scratchMountTarget := getScratchDiskMountTarget(containerID) var policyErr error - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { policyErr = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") if policyErr != nil { return policyErr @@ -6356,7 +6356,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { layerToErr := testRand.Intn(len(container.Layers)) for i, layerHash := range container.Layers { target := testDataGenerator.uniqueLayerMountTarget() - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { policyErr = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) if policyErr != nil { return policyErr @@ -6380,7 +6380,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } for _, layerPath := range succeedLayerPaths { - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { return policy.EnforceDeviceUnmountPolicy(gc.ctx, layerPath) }) if err != nil { @@ -6389,7 +6389,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } } - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { return policy.EnforceRWDeviceUnmountPolicy(gc.ctx, scratchMountTarget) }) if err != nil { @@ -6399,7 +6399,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } if testDenyInvalidContainerCreation { - err = policy.WithMetadataRollback(func() error { + err = policy.WithTransaction(func() error { _, _, _, policyErr = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) return policyErr }) diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index 0a497d0db9..bd48373cbc 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -57,11 +57,6 @@ func init() { registeredEnforcers[openDoorEnforcerName] = createOpenDoorEnforcer } -// ErrFatalPolicyDesync indicates that policy metadata could not be restored -// after a failed operation, leaving the enforcer in an unknown state. Callers -// should treat this as unrecoverable and block all further operations. -var ErrFatalPolicyDesync = errors.New("fatal policy metadata desync") - type SecurityPolicyEnforcer interface { EnforceDeviceMountPolicy(ctx context.Context, target string, deviceHash string) (err error) EnforceRWDeviceMountPolicy(ctx context.Context, target string, encrypted, ensureFilesystem bool, filesystem string) (err error) @@ -133,7 +128,7 @@ type SecurityPolicyEnforcer interface { GetUserInfo(spec *oci.Process, rootPath string) (IDName, []IDName, string, error) EnforceVerifiedCIMsPolicy(ctx context.Context, containerID string, layerHashes []string, mountedCim []string) (err error) EnforceRegistryChangesPolicy(ctx context.Context, containerID string, registryValues interface{}) error - WithMetadataRollback(fn func() error) error + WithTransaction(fn func() error) error } //nolint:unused @@ -330,7 +325,7 @@ func (OpenDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context.C return nil } -func (OpenDoorSecurityPolicyEnforcer) WithMetadataRollback(fn func() error) error { +func (OpenDoorSecurityPolicyEnforcer) WithTransaction(fn func() error) error { return fn() } @@ -463,6 +458,6 @@ func (ClosedDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context return errors.New("registry changes are denied by policy") } -func (ClosedDoorSecurityPolicyEnforcer) WithMetadataRollback(fn func() error) error { +func (ClosedDoorSecurityPolicyEnforcer) WithTransaction(fn func() error) error { return fn() } diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index f37c73b695..a665f06450 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -15,6 +15,7 @@ import ( "sync" "syscall" + "github.com/Microsoft/hcsshim/internal/gcs" "github.com/Microsoft/hcsshim/internal/guestpath" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" @@ -1195,10 +1196,10 @@ func (policy *regoEnforcer) GetUserInfo(process *oci.Process, rootPath string) ( return GetAllUserInfo(process, rootPath) } -// WithMetadataRollback snapshots metadata, runs fn and rolls metadata back if fn -// returns an error. Nested or concurrent calls are rejected. Returns +// WithTransaction snapshots metadata, runs fn and rolls metadata back if fn +// returns an error. Nested or concurrent transactions are rejected. Returns // the error from fn if it fails. -func (policy *regoEnforcer) WithMetadataRollback(fn func() error) error { +func (policy *regoEnforcer) WithTransaction(fn func() error) error { if !policy.transactionLock.TryLock() { return errors.New("nested or concurrent policy transactions are not supported") } @@ -1212,7 +1213,8 @@ func (policy *regoEnforcer) WithMetadataRollback(fn func() error) error { err = fn() if err != nil { if restoreErr := policy.rego.RestoreMetadata(saved); restoreErr != nil { - return fmt.Errorf("%w: restore failed (%v) after: %v", ErrFatalPolicyDesync, restoreErr, err) + gcs.UnrecoverableError(errors.Wrapf(restoreErr, + "failed to rollback policy metadata after error: %v", err)) } log.G(context.Background()).WithError(err).Warn("rolled back policy metadata due to error") return err From b962ce3a8df6da6b5aa1c7664390728b607ebab2 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Wed, 10 Jun 2026 22:50:44 +0000 Subject: [PATCH 10/12] securitypolicyenforcer: Rename WithTransaction to WithMetadataRollback Co-authored-by: Maksim An Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 8 ++--- pkg/securitypolicy/regopolicy_linux_test.go | 36 +++++++++---------- pkg/securitypolicy/securitypolicyenforcer.go | 6 ++-- .../securitypolicyenforcer_rego.go | 4 +-- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index e9c2b943b8..20d25ad194 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -1296,7 +1296,7 @@ func (h *Host) modifyMappedVirtualDisk( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations by marking the UVM state as inconsistent. - return securityPolicy.WithTransaction(func() error { + return securityPolicy.WithMetadataRollback(func() error { switch rt { case guestrequest.RequestTypeAdd: mountCtx, cancel := context.WithTimeout(ctx, time.Second*5) @@ -1398,7 +1398,7 @@ func (h *Host) modifyMappedDirectory( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return securityPolicy.WithTransaction(func() error { + return securityPolicy.WithMetadataRollback(func() error { switch rt { case guestrequest.RequestTypeAdd: err = securityPolicy.EnforcePlan9MountPolicy(ctx, md.MountPath) @@ -1453,7 +1453,7 @@ func (h *Host) modifyMappedVPMemDevice(ctx context.Context, // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return securityPolicy.WithTransaction(func() error { + return securityPolicy.WithMetadataRollback(func() error { switch rt { case guestrequest.RequestTypeAdd: err = securityPolicy.EnforceDeviceMountPolicy(ctx, vpd.MountPath, deviceHash) @@ -1522,7 +1522,7 @@ func (h *Host) modifyCombinedLayers( // transaction rollback mechanism on both mount and unmount errors, but if // we've actually called Unmount and it fails we permanently block further // device operations. - return securityPolicy.WithTransaction(func() error { + return securityPolicy.WithMetadataRollback(func() error { switch rt { case guestrequest.RequestTypeAdd: if h.HasSecurityPolicy() { diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 3bf07f75ad..236e64d4bd 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -978,7 +978,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { errSimulatedFailure := errors.New("simulated failure") - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchTarget, true, true, "xfs") }) if err != nil { @@ -993,7 +993,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { target := testDataGenerator.uniqueLayerMountTarget() layerPaths[len(tc.Layers)-i-1] = target var policyErr error - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) if policyErr != nil { return policyErr @@ -1012,7 +1012,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { overlayTarget := getOverlayMountTarget(tid) var policyErr error - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPaths, overlayTarget) if commitOnEnforcementFailure { return nil @@ -1020,7 +1020,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { return policyErr }) if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) return false } if !assertDecisionJSONContains(t, policyErr, append(slices.Clone(layerPaths), "no matching containers for overlay")...) { @@ -1034,7 +1034,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { } } - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, tid, layerPathsWithoutErr, overlayTarget) if commitOnEnforcementFailure { return nil @@ -1042,7 +1042,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { return policyErr }) if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) return false } if !assertDecisionJSONContains(t, policyErr, append(slices.Clone(layerPathsWithoutErr), "no matching containers for overlay")...) { @@ -1050,7 +1050,7 @@ func Test_Rego_EnforceOverlayMountPolicy_MountFail(t *testing.T) { } retryTarget := layerPaths[errLayerPathIndex] - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceDeviceMountPolicy(gc.ctx, retryTarget, tc.Layers[layerToErr]) }) if err != nil { @@ -6241,7 +6241,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { errSimulatedFailure := errors.New("simulated failure") scratchMountTarget := getScratchDiskMountTarget(containerID) - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") }) if err != nil { @@ -6251,7 +6251,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { overlayTarget := getOverlayMountTarget(containerID) var policyErr error - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceOverlayMountPolicy(gc.ctx, containerID, layers, overlayTarget) if policyErr != nil { return policyErr @@ -6264,7 +6264,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { _, _, _, policyErr = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) if commitOnEnforcementFailure { return nil @@ -6276,12 +6276,12 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } if err != nil && commitOnEnforcementFailure { - t.Errorf("Expected WithTransaction to not return an error, but got: %v", err) + t.Errorf("Expected WithMetadataRollback to not return an error, but got: %v", err) return false } // "Retry" overlay mount - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceOverlayMountPolicy(gc.ctx, tc.containerID, layers, overlayTarget) }) if err != nil { @@ -6289,7 +6289,7 @@ func Test_Rego_EnforceCreateContainer_RejectRevertedOverlayMount(t *testing.T) { return false } - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { _, _, _, err = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) return err }) @@ -6333,7 +6333,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { scratchMountTarget := getScratchDiskMountTarget(containerID) var policyErr error - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceRWDeviceMountPolicy(gc.ctx, scratchMountTarget, true, true, "xfs") if policyErr != nil { return policyErr @@ -6356,7 +6356,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { layerToErr := testRand.Intn(len(container.Layers)) for i, layerHash := range container.Layers { target := testDataGenerator.uniqueLayerMountTarget() - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { policyErr = policy.EnforceDeviceMountPolicy(gc.ctx, target, layerHash) if policyErr != nil { return policyErr @@ -6380,7 +6380,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } for _, layerPath := range succeedLayerPaths { - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceDeviceUnmountPolicy(gc.ctx, layerPath) }) if err != nil { @@ -6389,7 +6389,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } } - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { return policy.EnforceRWDeviceUnmountPolicy(gc.ctx, scratchMountTarget) }) if err != nil { @@ -6399,7 +6399,7 @@ func Test_Rego_EnforceCreateContainer_RetryEverything(t *testing.T) { } if testDenyInvalidContainerCreation { - err = policy.WithTransaction(func() error { + err = policy.WithMetadataRollback(func() error { _, _, _, policyErr = policy.EnforceCreateContainerPolicy(gc.ctx, tc.sandboxID, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, false, tc.noNewPrivileges, tc.user, tc.groups, tc.umask, tc.capabilities, tc.seccomp) return policyErr }) diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index bd48373cbc..3f94bae0f5 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -128,7 +128,7 @@ type SecurityPolicyEnforcer interface { GetUserInfo(spec *oci.Process, rootPath string) (IDName, []IDName, string, error) EnforceVerifiedCIMsPolicy(ctx context.Context, containerID string, layerHashes []string, mountedCim []string) (err error) EnforceRegistryChangesPolicy(ctx context.Context, containerID string, registryValues interface{}) error - WithTransaction(fn func() error) error + WithMetadataRollback(fn func() error) error } //nolint:unused @@ -325,7 +325,7 @@ func (OpenDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context.C return nil } -func (OpenDoorSecurityPolicyEnforcer) WithTransaction(fn func() error) error { +func (OpenDoorSecurityPolicyEnforcer) WithMetadataRollback(fn func() error) error { return fn() } @@ -458,6 +458,6 @@ func (ClosedDoorSecurityPolicyEnforcer) EnforceRegistryChangesPolicy(ctx context return errors.New("registry changes are denied by policy") } -func (ClosedDoorSecurityPolicyEnforcer) WithTransaction(fn func() error) error { +func (ClosedDoorSecurityPolicyEnforcer) WithMetadataRollback(fn func() error) error { return fn() } diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index a665f06450..a8832dc423 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -1196,10 +1196,10 @@ func (policy *regoEnforcer) GetUserInfo(process *oci.Process, rootPath string) ( return GetAllUserInfo(process, rootPath) } -// WithTransaction snapshots metadata, runs fn and rolls metadata back if fn +// WithMetadataRollback snapshots metadata, runs fn and rolls metadata back if fn // returns an error. Nested or concurrent transactions are rejected. Returns // the error from fn if it fails. -func (policy *regoEnforcer) WithTransaction(fn func() error) error { +func (policy *regoEnforcer) WithMetadataRollback(fn func() error) error { if !policy.transactionLock.TryLock() { return errors.New("nested or concurrent policy transactions are not supported") } From 5d8920e5a6755227079432e9486cda41203c99b5 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Wed, 10 Jun 2026 23:53:16 +0000 Subject: [PATCH 11/12] Fix lint Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 20d25ad194..0cd6448cae 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -151,7 +151,7 @@ func (u *uvmConsistencyError) Check() error { return nil } return fmt.Errorf( - "Mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error: %w", + "mount, unmount, container creation and deletion have been disabled in this UVM due to a previous error: %w", u.cause, ) } From c03fd7b7f7e9b24f614c78b263b9c55e403541eb Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Thu, 11 Jun 2026 13:55:47 +0000 Subject: [PATCH 12/12] Remove UnrecoverableError and use plain panic instead. ContainerPlatform pushed back against using an infinite loop due to concerns with IcMs. The plan is to simply panic now, then in a future PR make the gcs-sidecar a critical process Signed-off-by: Tingmao Wang --- internal/gcs/unrecoverable_error.go | 53 ------------------- .../securitypolicyenforcer_rego.go | 4 +- 2 files changed, 1 insertion(+), 56 deletions(-) delete mode 100644 internal/gcs/unrecoverable_error.go diff --git a/internal/gcs/unrecoverable_error.go b/internal/gcs/unrecoverable_error.go deleted file mode 100644 index 96528088aa..0000000000 --- a/internal/gcs/unrecoverable_error.go +++ /dev/null @@ -1,53 +0,0 @@ -package gcs - -import ( - "context" - "fmt" - "os" - "runtime" - "time" - - "github.com/Microsoft/hcsshim/internal/log" - "github.com/Microsoft/hcsshim/pkg/amdsevsnp" - "github.com/sirupsen/logrus" -) - -// UnrecoverableError logs the error and then puts the current thread into an -// infinite sleep loop. This is to be used instead of panicking, as the -// behaviour of GCS panics is unpredictable. This function can be extended to, -// for example, try to shutdown the VM cleanly. -func UnrecoverableError(err error) { - buf := make([]byte, 300*(1<<10)) - stackSize := runtime.Stack(buf, true) - stackTrace := string(buf[:stackSize]) - - errPrint := fmt.Sprintf( - "Unrecoverable error in GCS: %v\n%s", - err, stackTrace, - ) - - isSnp, err := amdsevsnp.IsSNP() - if err != nil { - // IsSNP() cannot fail on LCOW - // but if it does, we proceed as if we're on SNP to be safe. - isSnp = true - } - - if isSnp { - errPrint += "\nThis thread will now enter an infinite loop." - } - log.G(context.Background()).WithError(err).Logf( - logrus.FatalLevel, - "%s", - errPrint, - ) - - if !isSnp { - panic("Unrecoverable error in GCS: " + err.Error()) - } else { - fmt.Fprintf(os.Stderr, "%s\n", errPrint) - for { - time.Sleep(time.Hour) - } - } -} diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index a8832dc423..6cd911b5ec 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -15,7 +15,6 @@ import ( "sync" "syscall" - "github.com/Microsoft/hcsshim/internal/gcs" "github.com/Microsoft/hcsshim/internal/guestpath" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" @@ -1213,8 +1212,7 @@ func (policy *regoEnforcer) WithMetadataRollback(fn func() error) error { err = fn() if err != nil { if restoreErr := policy.rego.RestoreMetadata(saved); restoreErr != nil { - gcs.UnrecoverableError(errors.Wrapf(restoreErr, - "failed to rollback policy metadata after error: %v", err)) + panic(fmt.Sprintf("failed to rollback policy metadata: %v (caused by error: %v)", restoreErr, err)) } log.G(context.Background()).WithError(err).Warn("rolled back policy metadata due to error") return err