diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go index a59618137b..7adc6fc389 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go @@ -931,7 +931,11 @@ func (ht *hcsTask) updateTaskContainerResources(ctx context.Context, data interf func (ht *hcsTask) updateWCOWContainerCPU(ctx context.Context, cpu *specs.WindowsCPUResources) error { // if host is 20h2+ then we can make a request directly to hcs if osversion.Get().Build >= osversion.V20H2 { + // Count/Maximum/Shares live on the HCS Processor schema. Only send a modify + // request when at least one of them is set, so an affinity-only update does + // not push an empty (no-op) request to HCS. req := &hcsschema.Processor{} + hasRateControl := false if cpu.Count != nil { procCount := int32(*cpu.Count) hostProcs := processorinfo.ProcessorCount() @@ -939,23 +943,71 @@ func (ht *hcsTask) updateWCOWContainerCPU(ctx context.Context, cpu *specs.Window hostProcs = ht.host.ProcessorCount() } req.Count = hcsoci.NormalizeProcessorCount(ctx, ht.id, procCount, hostProcs) + hasRateControl = true } if cpu.Maximum != nil { req.Maximum = int32(*cpu.Maximum) + hasRateControl = true } if cpu.Shares != nil { req.Weight = int32(*cpu.Shares) + hasRateControl = true } - return ht.requestUpdateContainer(ctx, resourcepaths.SiloProcessorResourcePath, req) + if hasRateControl { + if err := ht.requestUpdateContainer(ctx, resourcepaths.SiloProcessorResourcePath, req); err != nil { + return err + } + } + + // CPU affinity is not part of the HCS Processor schema, so it has to be + // applied out of band (the silo's job object for Argon). A no-op when unset. + if len(cpu.Affinity) > 0 { + return ht.updateWCOWContainerCPUAffinity(ctx, cpu.Affinity) + } + return nil } return errdefs.ErrNotImplemented } +// updateWCOWContainerCPUAffinity honors a post-start change to +// spec.Windows.Resources.CPU.Affinity for an HCS-backed WCOW container. 
+// +// For process-isolated (Argon) containers this re-pins the silo's job object, using +// the same race-free mechanism as create-time: the Windows kernel re-applies the new +// mask to every process already in the silo and to every future joiner. +// +// Hypervisor-isolated (Xenon) containers require swapping the UVM's CPU group instead; +// that is not yet implemented, so this returns ErrNotImplemented rather than silently +// dropping the request. +func (ht *hcsTask) updateWCOWContainerCPUAffinity(ctx context.Context, affinity []specs.WindowsCPUGroupAffinity) error { + validated, err := hcsoci.ValidateCPUAffinityEntries(affinity) + if err != nil { + return err + } + if len(validated) == 0 { + return nil + } + + if ht.host != nil { + // Xenon: UVM-level CPU-group swap is out of scope here (Track A). + return fmt.Errorf("cpu affinity update for hypervisor-isolated containers is not supported: %w", errdefs.ErrNotImplemented) + } + + // ht.c speaks the cow.Container interface; the underlying implementation + // (an Argon silo) honors affinity, while others return ErrNotImplemented. + return ht.c.SetCPUGroupAffinities(ctx, hcsoci.ToJobObjectAffinities(validated)) +} + func isValidWindowsCPUResources(c *specs.WindowsCPUResources) bool { - return (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) || + // Exactly one of the mutually-exclusive rate controls (Count/Shares/Maximum). + exactlyOneRateControl := (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) || (c.Shares != nil && (c.Count == nil && c.Maximum == nil)) || (c.Maximum != nil && (c.Count == nil && c.Shares == nil)) + // An affinity-only update carries no rate control; accept it on its own so that + // CPU affinity can be changed after the container has started. 
+ affinityOnly := len(c.Affinity) > 0 && c.Count == nil && c.Shares == nil && c.Maximum == nil + return exactlyOneRateControl || affinityOnly } func (ht *hcsTask) updateWCOWResources(ctx context.Context, resources *specs.WindowsResources, annotations map[string]string) error { diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs_test.go b/cmd/containerd-shim-runhcs-v1/task_hcs_test.go index beb58ffc50..d922d8e2f3 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs_test.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs_test.go @@ -4,12 +4,14 @@ package main import ( "context" + "errors" "math/rand" "reflect" "strconv" "testing" "time" + "github.com/Microsoft/hcsshim/internal/uvm" "github.com/Microsoft/hcsshim/pkg/annotations" "github.com/containerd/errdefs" "github.com/opencontainers/runtime-spec/specs-go" @@ -506,3 +508,45 @@ func Test_handleProcessArgsForIsolatedJobContainer(t *testing.T) { }) } } + +func u64(v uint64) *uint64 { return &v } +func u16(v uint16) *uint16 { return &v } + +func Test_isValidWindowsCPUResources(t *testing.T) { + affinity := []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x3}} + for _, tt := range []struct { + name string + c *specs.WindowsCPUResources + want bool + }{ + {"count only", &specs.WindowsCPUResources{Count: u64(2)}, true}, + {"shares only", &specs.WindowsCPUResources{Shares: u16(100)}, true}, + {"maximum only", &specs.WindowsCPUResources{Maximum: u16(5000)}, true}, + {"count and shares", &specs.WindowsCPUResources{Count: u64(2), Shares: u16(100)}, false}, + {"affinity only", &specs.WindowsCPUResources{Affinity: affinity}, true}, + {"affinity with count", &specs.WindowsCPUResources{Count: u64(2), Affinity: affinity}, true}, + {"empty", &specs.WindowsCPUResources{}, false}, + } { + t.Run(tt.name, func(t *testing.T) { + if got := isValidWindowsCPUResources(tt.c); got != tt.want { + t.Fatalf("isValidWindowsCPUResources(%+v) = %v, want %v", tt.c, got, tt.want) + } + }) + } +} + +func 
Test_hcsTask_updateWCOWContainerCPUAffinity_NoAffinity(t *testing.T) { + ht := &hcsTask{id: t.Name()} + // An empty affinity slice is a no-op and must not require an HCS-backed container. + if err := ht.updateWCOWContainerCPUAffinity(context.Background(), nil); err != nil { + t.Fatalf("expected nil error for empty affinity, got %v", err) + } +} + +func Test_hcsTask_updateWCOWContainerCPUAffinity_XenonNotImplemented(t *testing.T) { + ht := &hcsTask{id: t.Name(), host: &uvm.UtilityVM{}} + err := ht.updateWCOWContainerCPUAffinity(context.Background(), []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x1}}) + if !errors.Is(err, errdefs.ErrNotImplemented) { + t.Fatalf("expected ErrNotImplemented for hypervisor-isolated container, got %v", err) + } +} diff --git a/internal/cow/cow.go b/internal/cow/cow.go index b60cd383b6..885668d75a 100644 --- a/internal/cow/cow.go +++ b/internal/cow/cow.go @@ -8,6 +8,7 @@ import ( "github.com/Microsoft/hcsshim/internal/hcs/schema1" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" + "github.com/Microsoft/hcsshim/internal/jobobject" ) // Process is the interface for an OS process running in a container or utility VM. @@ -96,4 +97,10 @@ type Container interface { WaitError() error // Modify sends a request to modify container resources Modify(ctx context.Context, config interface{}) error + // SetCPUGroupAffinities pins the container's processes to the given CPU + // group affinities. It exists because CPU affinity is not part of the HCS + // container Processor schema and must be applied out of band (on the silo's + // job object for process-isolated Windows containers). Implementations that + // do not support setting CPU affinity return errdefs.ErrNotImplemented. 
+ SetCPUGroupAffinities(ctx context.Context, affinities []jobobject.GroupAffinity) error } diff --git a/internal/gcs/container.go b/internal/gcs/container.go index 549abd35a2..96bd37ed59 100644 --- a/internal/gcs/container.go +++ b/internal/gcs/container.go @@ -5,6 +5,7 @@ package gcs import ( "context" "errors" + "fmt" "sync" "time" @@ -12,8 +13,10 @@ import ( "github.com/Microsoft/hcsshim/internal/gcs/prot" "github.com/Microsoft/hcsshim/internal/hcs/schema1" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" + "github.com/Microsoft/hcsshim/internal/jobobject" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/oc" + "github.com/containerd/errdefs" "go.opencensus.io/trace" ) @@ -138,6 +141,13 @@ func (c *Container) Modify(ctx context.Context, config interface{}) (err error) return c.gc.brdg.RPC(ctx, prot.RPCModifySettings, &req, &resp, false) } +// SetCPUGroupAffinities implements the cow.Container interface. CPU affinity is +// applied on the host's silo job object, which a guest-side container does not +// own, so this returns ErrNotImplemented. +func (c *Container) SetCPUGroupAffinities(_ context.Context, _ []jobobject.GroupAffinity) error { + return fmt.Errorf("cpu affinity is not supported for guest containers: %w", errdefs.ErrNotImplemented) +} + // Properties returns the requested container properties targeting a V1 schema prot.Container. 
func (c *Container) Properties(ctx context.Context, types ...schema1.PropertyType) (_ *schema1.ContainerProperties, err error) { ctx, span := oc.StartSpan(ctx, "gcs::Container::Properties", oc.WithClientSpanKind) diff --git a/internal/hcs/system.go b/internal/hcs/system.go index 869a5f3e7a..49e9d785fb 100644 --- a/internal/hcs/system.go +++ b/internal/hcs/system.go @@ -424,6 +424,20 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr return properties, nil } +// openSilo opens the container's server silo job object by its well-known name +// (`\Container_` followed by the compute system ID). HCS owns the silo; the only way to open it from the shim is +// by name, and only while running as SYSTEM. The caller owns the returned handle and +// must Close it. +// +// In the future we can make use of some new functionality in HCS that allows you to +// pass a job object for HCS to use for the container. +func (computeSystem *System) openSilo(ctx context.Context) (*jobobject.JobObject, error) { + return jobobject.Open(ctx, &jobobject.Options{ + UseNTVariant: true, + Name: siloNameFmt(computeSystem.id), + }) +} + // queryInProc handles querying for container properties without reaching out to HCS. `props` // will be updated to contain any data returned from the queries present in `types`. If any properties // failed to be queried they will be tallied up and returned in as the first return value. Failures on @@ -434,14 +448,7 @@ func (computeSystem *System) queryInProc( props *hcsschema.Properties, types []hcsschema.PropertyType, ) ([]hcsschema.PropertyType, error) { - // In the future we can make use of some new functionality in the HCS that allows you - // to pass a job object for HCS to use for the container. Currently, the only way we'll - // be able to open the job/silo is if we're running as SYSTEM. 
- jobOptions := &jobobject.Options{ - UseNTVariant: true, - Name: siloNameFmt(computeSystem.id), - } - job, err := jobobject.Open(ctx, jobOptions) + job, err := computeSystem.openSilo(ctx) if err != nil { return nil, err } @@ -535,6 +542,48 @@ func (computeSystem *System) statisticsInProc(job *jobobject.JobObject) (*hcssch }, nil } +// SetCPUGroupAffinities pins the container's server silo to the given processor +// group affinities. HCS does not expose a CPU-affinity field on the container Processor +// schema, so for process-isolated (Argon) containers we set the affinity directly on the +// silo's job object via SetInformationJobObject(JobObjectGroupInformationEx). +// +// HCS owns the silo; we only open a transient handle (by the silo's well-known job name, +// the same handle queryInProc opens) to record the affinity property. The kernel enforces +// it on every process that joins the silo via AssignProcessToJobObject — including the init +// process at Start and any descendants it spawns. +// +// To pin a container from creation, call this after the compute system is created but before +// it is started, so the affinity is already recorded on the job when HCS assigns the init +// process. Applying it to an already-running silo (the update path) is also safe: the kernel +// re-applies the mask to current members and migrates threads at the next scheduling dispatch. +// +// It implements the cow.Container interface. +func (computeSystem *System) SetCPUGroupAffinities(ctx context.Context, affinities []jobobject.GroupAffinity) error { + computeSystem.handleLock.RLock() + defer computeSystem.handleLock.RUnlock() + + // Guard the compute system's lifecycle while we touch its silo: the RLock blocks + // a concurrent Close(), and handle == 0 means it is already torn down. + if computeSystem.handle == 0 { + return fmt.Errorf("set cpu group affinities on %s silo: %w", computeSystem.ID(), ErrAlreadyClosed) + } + // The silo job object only exists for containers, not VM-based compute systems. 
+ if computeSystem.typ != "container" { + return fmt.Errorf("cpu group affinities are only supported on container compute systems, got %q", computeSystem.typ) + } + + job, err := computeSystem.openSilo(ctx) + if err != nil { + return fmt.Errorf("open %s silo: %w", computeSystem.ID(), err) + } + defer job.Close() + + if err := job.SetCPUGroupAffinities(affinities); err != nil { + return fmt.Errorf("set cpu group affinities on %s silo: %w", computeSystem.ID(), err) + } + return nil +} + // hcsPropertiesV2Query is a helper to make a HcsGetComputeSystemProperties call using the V2 schema property types. func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []hcsschema.PropertyType) (*hcsschema.Properties, error) { operation := "hcs::System::PropertiesV2" diff --git a/internal/hcsoci/cpuaffinity.go b/internal/hcsoci/cpuaffinity.go new file mode 100644 index 0000000000..f794259103 --- /dev/null +++ b/internal/hcsoci/cpuaffinity.go @@ -0,0 +1,130 @@ +//go:build windows +// +build windows + +package hcsoci + +import ( + "context" + "errors" + "fmt" + + specs "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/Microsoft/hcsshim/internal/hcs" + "github.com/Microsoft/hcsshim/internal/jobobject" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/osversion" +) + +// This file holds the package's CPU affinity code for +// spec.Windows.Resources.CPU.Affinity. Most of it is container-kind-agnostic +// validation and conversion (ValidateCPUAffinity / ValidateCPUAffinityEntries / +// ToJobObjectAffinities), shared with the HostProcess path in +// internal/jobcontainers so the logic is not duplicated across packages. The +// Argon-specific create-time glue (applyArgonCPUAffinity) lives here too. + +// Sentinel errors returned by ValidateCPUAffinity / ValidateCPUAffinityEntries. 
+var ( + // ErrCPUAffinityMultipleGroupsNotSupported is returned when multiple processor-group + // affinity entries are requested on a host older than Windows Server 2022 (build 20348), + // which does not support multi-group affinity for job object silos. + // On Windows Server 2022+, multiple processor groups are fully supported. + ErrCPUAffinityMultipleGroupsNotSupported = errors.New("cpu affinity with multiple processor groups requires Windows Server 2022 or later") + // ErrCPUAffinityNonZeroGroupNotSupported is returned when a non-zero processor group is + // requested on a host older than Windows Server 2022 (build 20348). + // On Windows Server 2022+, non-zero processor groups are fully supported. + ErrCPUAffinityNonZeroGroupNotSupported = errors.New("cpu affinity with a non-zero processor group requires Windows Server 2022 or later") + // ErrCPUAffinityMaskZero is returned when an affinity entry has a zero bitmask, + // which would select no processors and is always invalid. + ErrCPUAffinityMaskZero = errors.New("cpu affinity mask must be non-zero") +) + +// ValidateCPUAffinity handles the logic of validating the container's CPU affinity +// specified in the OCI spec. +// +// Returns the validated affinity entries (nil if not specified) and any validation error. +// Multiple processor groups and non-zero group numbers require Windows Server 2022 +// (build 20348) or later; on older hosts only a single entry for group 0 is accepted. +func ValidateCPUAffinity(spec *specs.Spec) ([]specs.WindowsCPUGroupAffinity, error) { + if spec.Windows == nil || spec.Windows.Resources == nil || spec.Windows.Resources.CPU == nil { + return nil, nil + } + return ValidateCPUAffinityEntries(spec.Windows.Resources.CPU.Affinity) +} + +// ValidateCPUAffinityEntries validates a set of OCI CPU affinity entries directly, +// applying the same rules as ValidateCPUAffinity. 
It is used on the container update +// path, where the affinity is supplied as a bare slice rather than a full spec. +// +// Returns the validated entries (nil if empty) and any validation error. +func ValidateCPUAffinityEntries(affinity []specs.WindowsCPUGroupAffinity) ([]specs.WindowsCPUGroupAffinity, error) { + if len(affinity) == 0 { + return nil, nil + } + + // Zero masks are never valid regardless of OS version. + for i, a := range affinity { + if a.Mask == 0 { + return nil, fmt.Errorf("%w: entry %d has zero mask", ErrCPUAffinityMaskZero, i) + } + } + + // Determine whether multi-group features are needed: either multiple entries, + // or a single entry targeting a non-zero processor group. + multiGroup := len(affinity) > 1 || affinity[0].Group != 0 + + // Multiple processor groups are only supported on Windows Server 2022+. + if multiGroup && osversion.Build() < osversion.LTSC2022 { + if len(affinity) > 1 { + return nil, fmt.Errorf("%w: %d entries", ErrCPUAffinityMultipleGroupsNotSupported, len(affinity)) + } + return nil, fmt.Errorf("%w: group %d", ErrCPUAffinityNonZeroGroupNotSupported, affinity[0].Group) + } + + return affinity, nil +} + +// ToJobObjectAffinities converts validated OCI CPU affinity entries into the +// jobobject.GroupAffinity representation used by the Win32 job-object APIs. +// +// The input is expected to already have been run through ValidateCPUAffinity. +func ToJobObjectAffinities(affinities []specs.WindowsCPUGroupAffinity) []jobobject.GroupAffinity { + if len(affinities) == 0 { + return nil + } + out := make([]jobobject.GroupAffinity, len(affinities)) + for i, a := range affinities { + out[i] = jobobject.GroupAffinity{ + Mask: a.Mask, + Group: uint16(a.Group), + } + } + return out +} + +// applyArgonCPUAffinity honors spec.Windows.Resources.CPU.Affinity for a +// process-isolated (Argon) container by pinning the container's server silo. 
+// +// HCS ignores CPU affinity on the container Processor schema (Count/Maximum/Weight), +// so instead we set the affinity on the silo's job object directly. This must run +// after the compute system is created but before it is started, so the affinity is +// already recorded on the job when HCS assigns the init process to the silo. See +// (*hcs.System).SetCPUGroupAffinities for the race-free timeline. +// +// If the spec requests no affinity this is a no-op. +func applyArgonCPUAffinity(ctx context.Context, system *hcs.System, coi *createOptionsInternal) error { + affinities, err := ValidateCPUAffinity(coi.Spec) + if err != nil { + return err + } + if len(affinities) == 0 { + return nil + } + + if err := system.SetCPUGroupAffinities(ctx, ToJobObjectAffinities(affinities)); err != nil { + return fmt.Errorf("apply CPU affinity to container silo: %w", err) + } + + log.G(ctx).WithField("affinities", affinities).Debug("applied CPU affinity to Argon container silo") + return nil +} diff --git a/internal/hcsoci/cpuaffinity_test.go b/internal/hcsoci/cpuaffinity_test.go new file mode 100644 index 0000000000..c74c63d3e4 --- /dev/null +++ b/internal/hcsoci/cpuaffinity_test.go @@ -0,0 +1,83 @@ +//go:build windows +// +build windows + +package hcsoci + +import ( + "errors" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/Microsoft/hcsshim/internal/jobobject" +) + +func TestValidateCPUAffinityEntries(t *testing.T) { + // A zero mask is invalid on every OS version, so this case is host-independent. + if _, err := ValidateCPUAffinityEntries([]specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0}}); !errors.Is(err, ErrCPUAffinityMaskZero) { + t.Fatalf("zero mask: got %v, want %v", err, ErrCPUAffinityMaskZero) + } + + // Empty input validates to no entries (no affinity requested). 
+ got, err := ValidateCPUAffinityEntries(nil) + if err != nil || got != nil { + t.Fatalf("nil input: got (%v, %v), want (nil, nil)", got, err) + } + + // A single group-0 entry with a non-zero mask is valid regardless of OS version. + in := []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x3}} + got, err = ValidateCPUAffinityEntries(in) + if err != nil { + t.Fatalf("group-0 single entry: unexpected error %v", err) + } + if len(got) != 1 || got[0] != in[0] { + t.Fatalf("group-0 single entry: got %+v, want %+v", got, in) + } +} + +func TestToJobObjectAffinities(t *testing.T) { + for _, tc := range []struct { + name string + in []specs.WindowsCPUGroupAffinity + want []jobobject.GroupAffinity + }{ + { + name: "nil", + in: nil, + want: nil, + }, + { + name: "empty", + in: []specs.WindowsCPUGroupAffinity{}, + want: nil, + }, + { + name: "single group", + in: []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0b1011}}, + want: []jobobject.GroupAffinity{{Group: 0, Mask: 0b1011}}, + }, + { + name: "multiple groups", + in: []specs.WindowsCPUGroupAffinity{ + {Group: 0, Mask: 0xff}, + {Group: 1, Mask: 0x1}, + }, + want: []jobobject.GroupAffinity{ + {Group: 0, Mask: 0xff}, + {Group: 1, Mask: 0x1}, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + got := ToJobObjectAffinities(tc.in) + if len(got) != len(tc.want) { + t.Fatalf("got %d entries, want %d", len(got), len(tc.want)) + } + for i := range got { + if got[i] != tc.want[i] { + t.Errorf("entry %d: got %+v, want %+v", i, got[i], tc.want[i]) + } + } + }) + } +} diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go index 5288932fa1..d59007e29b 100644 --- a/internal/hcsoci/create.go +++ b/internal/hcsoci/create.go @@ -357,6 +357,18 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C if err != nil { return nil, r, err } + + // Process-isolated (Argon) containers run in a server silo on the host. 
HCS does not + // have CPU affinity on the container Processor schema, so pin the silo's job object + // directly, after create but before the caller starts the container. Only the modern + // V2 schema is handled; legacy V1 Argon and Xenon (UVM-backed) containers are out of + // scope here (Xenon is handled at the UVM layer). + if coi.isV2Argon() { + if err := applyArgonCPUAffinity(ctx, system, coi); err != nil { + return nil, r, err + } + } + return system, r, nil } diff --git a/internal/hcsoci/hcsdoc_wcow.go b/internal/hcsoci/hcsdoc_wcow.go index 52f01e2ab6..3da8bd27be 100644 --- a/internal/hcsoci/hcsdoc_wcow.go +++ b/internal/hcsoci/hcsdoc_wcow.go @@ -32,22 +32,6 @@ import ( const createContainerSubdirectoryForProcessDumpSuffix = "{container_id}" -// Sentinel errors returned by ValidateCPUAffinity. -var ( - // ErrCPUAffinityMultipleGroupsNotSupported is returned when multiple processor-group - // affinity entries are requested on a host older than Windows Server 2022 (build 20348), - // which does not support multi-group affinity for job object silos. - // On Windows Server 2022+, multiple processor groups are fully supported. - ErrCPUAffinityMultipleGroupsNotSupported = errors.New("cpu affinity with multiple processor groups requires Windows Server 2022 or later") - // ErrCPUAffinityNonZeroGroupNotSupported is returned when a non-zero processor group is - // requested on a host older than Windows Server 2022 (build 20348). - // On Windows Server 2022+, non-zero processor groups are fully supported. - ErrCPUAffinityNonZeroGroupNotSupported = errors.New("cpu affinity with a non-zero processor group requires Windows Server 2022 or later") - // ErrCPUAffinityMaskZero is returned when an affinity entry has a zero bitmask, - // which would select no processors and is always invalid. - ErrCPUAffinityMaskZero = errors.New("cpu affinity mask must be non-zero") -) - // A simple wrapper struct around the container mount configs that should be added to the // container. 
type mountsConfig struct { @@ -111,41 +95,6 @@ func createMountsConfig(ctx context.Context, coi *createOptionsInternal) (*mount return &config, nil } -// ValidateCPUAffinity handles the logic of validating the container's CPU affinity -// specified in the OCI spec. -// -// Returns the validated affinity entries (nil if not specified) and any validation error. -// Multiple processor groups and non-zero group numbers require Windows Server 2022 -// (build 20348) or later; on older hosts only a single entry for group 0 is accepted. -func ValidateCPUAffinity(spec *specs.Spec) ([]specs.WindowsCPUGroupAffinity, error) { - if spec.Windows == nil || spec.Windows.Resources == nil || spec.Windows.Resources.CPU == nil || len(spec.Windows.Resources.CPU.Affinity) == 0 { - return nil, nil - } - - affinity := spec.Windows.Resources.CPU.Affinity - - // Zero masks are never valid regardless of OS version. - for i, a := range affinity { - if a.Mask == 0 { - return nil, fmt.Errorf("%w: entry %d has zero mask", ErrCPUAffinityMaskZero, i) - } - } - - // Determine whether multi-group features are needed: either multiple entries, - // or a single entry targeting a non-zero processor group. - multiGroup := len(affinity) > 1 || affinity[0].Group != 0 - - // Multiple processor groups are only supported on Windows Server 2022+. - if multiGroup && osversion.Build() < osversion.LTSC2022 { - if len(affinity) > 1 { - return nil, fmt.Errorf("%w: %d entries", ErrCPUAffinityMultipleGroupsNotSupported, len(affinity)) - } - return nil, fmt.Errorf("%w: group %d", ErrCPUAffinityNonZeroGroupNotSupported, affinity[0].Group) - } - - return affinity, nil -} - // ConvertCPULimits handles the logic of converting and validating the containers CPU limits // specified in the OCI spec to what HCS expects. 
// diff --git a/internal/jobcontainers/jobcontainer.go b/internal/jobcontainers/jobcontainer.go index 5fd5b2dc29..d0b560f9d2 100644 --- a/internal/jobcontainers/jobcontainer.go +++ b/internal/jobcontainers/jobcontainer.go @@ -26,6 +26,7 @@ import ( "github.com/Microsoft/hcsshim/internal/queue" "github.com/Microsoft/hcsshim/internal/resources" "github.com/Microsoft/hcsshim/internal/winapi" + "github.com/containerd/errdefs" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "golang.org/x/sys/windows" @@ -425,6 +426,13 @@ func (c *JobContainer) Modify(ctx context.Context, config interface{}) (err erro return errors.New("modify not supported for job containers") } +// SetCPUGroupAffinities implements the cow.Container interface. Job (HostProcess) +// containers apply CPU affinity at create time rather than via a post-start +// update, so this returns ErrNotImplemented. +func (c *JobContainer) SetCPUGroupAffinities(_ context.Context, _ []jobobject.GroupAffinity) error { + return fmt.Errorf("cpu affinity update is not supported for job containers: %w", errdefs.ErrNotImplemented) +} + // Start starts the container. There's nothing to "start" for job containers, so this just // sets the start timestamp. 
func (c *JobContainer) Start(ctx context.Context) error { diff --git a/internal/jobcontainers/oci.go b/internal/jobcontainers/oci.go index b0b07927dc..fa0b1bd276 100644 --- a/internal/jobcontainers/oci.go +++ b/internal/jobcontainers/oci.go @@ -46,16 +46,7 @@ func specToLimits(ctx context.Context, cid string, s *specs.Spec) (*jobobject.Jo if err != nil { return nil, err } - var groupAffinities []jobobject.GroupAffinity - if len(affinities) > 0 { - groupAffinities = make([]jobobject.GroupAffinity, len(affinities)) - for i, a := range affinities { - groupAffinities[i] = jobobject.GroupAffinity{ - Mask: a.Mask, - Group: uint16(a.Group), - } - } - } + groupAffinities := hcsoci.ToJobObjectAffinities(affinities) realCPULimit, realCPUWeight := uint32(cpuLimit), uint32(cpuWeight) if cpuCount != 0 { diff --git a/internal/winapi/cpuaffinity.go b/internal/winapi/cpuaffinity.go new file mode 100644 index 0000000000..a7d6281fe5 --- /dev/null +++ b/internal/winapi/cpuaffinity.go @@ -0,0 +1,21 @@ +package winapi + +// BOOL GetProcessGroupAffinity( +// [in] HANDLE hProcess, +// [in, out] PUSHORT GroupCount, +// [out] PUSHORT GroupArray +// ); +// +//sys GetProcessGroupAffinity(process windows.Handle, groupCount *uint16, groupArray *uint16) (err error) = kernel32.GetProcessGroupAffinity + +// BOOL GetProcessAffinityMask( +// [in] HANDLE hProcess, +// [out] PDWORD_PTR lpProcessAffinityMask, +// [out] PDWORD_PTR lpSystemAffinityMask +// ); +// +//sys GetProcessAffinityMask(process windows.Handle, processAffinityMask *uintptr, systemAffinityMask *uintptr) (err error) = kernel32.GetProcessAffinityMask + +// WORD GetActiveProcessorGroupCount(); +// +//sys GetActiveProcessorGroupCount() (amount uint16) = kernel32.GetActiveProcessorGroupCount diff --git a/internal/winapi/zsyscall_windows.go b/internal/winapi/zsyscall_windows.go index 28b28d1c69..4a32bcacd6 100644 --- a/internal/winapi/zsyscall_windows.go +++ b/internal/winapi/zsyscall_windows.go @@ -63,6 +63,9 @@ var ( 
procCreatePseudoConsole = modkernel32.NewProc("CreatePseudoConsole") procCreateRemoteThread = modkernel32.NewProc("CreateRemoteThread") procGetActiveProcessorCount = modkernel32.NewProc("GetActiveProcessorCount") + procGetActiveProcessorGroupCount = modkernel32.NewProc("GetActiveProcessorGroupCount") + procGetProcessAffinityMask = modkernel32.NewProc("GetProcessAffinityMask") + procGetProcessGroupAffinity = modkernel32.NewProc("GetProcessGroupAffinity") procIsProcessInJob = modkernel32.NewProc("IsProcessInJob") procLocalAlloc = modkernel32.NewProc("LocalAlloc") procLocalFree = modkernel32.NewProc("LocalFree") @@ -270,6 +273,28 @@ func GetActiveProcessorCount(groupNumber uint16) (amount uint32) { return } +func GetActiveProcessorGroupCount() (amount uint16) { + r0, _, _ := syscall.SyscallN(procGetActiveProcessorGroupCount.Addr()) + amount = uint16(r0) + return +} + +func GetProcessAffinityMask(process windows.Handle, processAffinityMask *uintptr, systemAffinityMask *uintptr) (err error) { + r1, _, e1 := syscall.SyscallN(procGetProcessAffinityMask.Addr(), uintptr(process), uintptr(unsafe.Pointer(processAffinityMask)), uintptr(unsafe.Pointer(systemAffinityMask))) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + +func GetProcessGroupAffinity(process windows.Handle, groupCount *uint16, groupArray *uint16) (err error) { + r1, _, e1 := syscall.SyscallN(procGetProcessGroupAffinity.Addr(), uintptr(process), uintptr(unsafe.Pointer(groupCount)), uintptr(unsafe.Pointer(groupArray))) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *int32) (err error) { r1, _, e1 := syscall.SyscallN(procIsProcessInJob.Addr(), uintptr(procHandle), uintptr(jobHandle), uintptr(unsafe.Pointer(result))) if r1 == 0 { diff --git a/test/functional/container_affinity_test.go b/test/functional/container_affinity_test.go new file mode 100644 index 0000000000..2c53d1869d --- /dev/null +++ 
b/test/functional/container_affinity_test.go @@ -0,0 +1,301 @@ +//go:build windows && functional +// +build windows,functional + +package functional + +import ( + "context" + "errors" + "testing" + + "github.com/containerd/containerd/v2/core/containers" + ctrdoci "github.com/containerd/containerd/v2/pkg/oci" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/windows" + + "github.com/Microsoft/hcsshim/internal/jobobject" + "github.com/Microsoft/hcsshim/internal/winapi" + "github.com/Microsoft/hcsshim/osversion" + + testcontainer "github.com/Microsoft/hcsshim/test/internal/container" + testlayers "github.com/Microsoft/hcsshim/test/internal/layers" + testoci "github.com/Microsoft/hcsshim/test/internal/oci" + "github.com/Microsoft/hcsshim/test/internal/util" + "github.com/Microsoft/hcsshim/test/pkg/require" +) + +// Test_Container_CPUAffinity_Argon is the CI-gating functional test for honoring +// spec.Windows.Resources.CPU.Affinity on process-isolated (Argon) WCOW containers +// (commit "hcsoci,hcs,shim: honor CPU affinity for Argon containers"). +// +// It asserts the three layers from the validation strategy, all reachable from this +// one in-process test (the functional suite runs in-process with internal/jobobject +// and as SYSTEM, so it can open the silo job by name): +// +// Layer 1 — the PR wrote the affinity to the silo's job object in the +// create→start window. This is the real regression gate: it fails if +// applyArgonCPUAffinity / SetCPUGroupAffinities regresses. +// Layer 2 — the host's view matches. The NT-variant silo job IS the host object, +// so the same GetCPUGroupAffinities read-back doubles as the host view; +// no second tool is needed. +// Layer 3 — the init process is actually constrained. This is a kernel guarantee +// (the kernel propagates the silo job's affinity onto silo members), not +// hcsshim code. 
+//            If the affinity cannot be read (OpenProcess /
+//            GetProcessGroupAffinity fail) the check is skipped, but a genuine
+//            mismatch is a hard failure: with Layer 1 passing, it points at the
+//            kernel/silo plumbing rather than this PR.
+func Test_Container_CPUAffinity_Argon(t *testing.T) {
+	requireFeatures(t, featureWCOW)
+	// Affinity is applied via the silo job object on 20H2+ (the same floor as the
+	// rest of the WCOW resource-update path).
+	require.Build(t, osversion.V20H2)
+
+	ctx := util.Context(namespacedContext(context.Background()), t)
+
+	// Group 0 / single-mask is the default CI case. The mask names CPUs 0 and 1, so
+	// the host must expose at least two active processors in group 0; a mask that
+	// names a processor the host lacks cannot be honored, so skip rather than fail
+	// on a topology the runner does not have.
+	t.Run("Group0SingleMask", func(t *testing.T) {
+		if n := winapi.GetActiveProcessorCount(0); n < 2 {
+			t.Skipf("group 0 mask 0x3 requires at least 2 active processors in group 0, got %d", n)
+		}
+		want := []jobobject.GroupAffinity{{Group: 0, Mask: 0x3}} // CPUs 0 and 1.
+		runArgonAffinityTest(ctx, t, want)
+	})
+
+	// A genuine multi-group pin needs a confirmed >1-processor-group host and
+	// Windows Server 2022+; skip otherwise rather than assert against a topology
+	// the runner does not have.
+	t.Run("MultiGroup", func(t *testing.T) {
+		require.Build(t, osversion.LTSC2022)
+		if n := activeProcessorGroupCount(t); n < 2 {
+			t.Skipf("multi-group affinity requires a host with >1 processor group, got %d", n)
+		}
+		want := []jobobject.GroupAffinity{
+			{Group: 0, Mask: 0x1},
+			{Group: 1, Mask: 0x1},
+		}
+		runArgonAffinityTest(ctx, t, want)
+	})
+}
+
+// runArgonAffinityTest creates an Argon container pinned to want, then asserts the
+// three validation layers.
+//
+// ctx is the namespaced test context, t the (sub)test the assertions are reported
+// against, and want the affinity entries that are both written into the OCI spec
+// and expected back from the silo job object and the init process.
+func runArgonAffinityTest(ctx context.Context, t *testing.T, want []jobobject.GroupAffinity) {
+	t.Helper()
+
+	cID := testName(t, "container")
+	scratch := testlayers.WCOWScratchDir(ctx, t, "")
+	spec := testoci.CreateWindowsSpec(ctx, t, cID,
+		testoci.DefaultWindowsSpecOpts(cID,
+			ctrdoci.WithProcessCommandLine(testoci.PingSelfCmd),
+			testoci.WithWindowsLayerFolders(append(windowsImageLayers(ctx, t), scratch)),
+			withCPUAffinity(want),
+		)...)
+
+	// nil host => process-isolated (Argon). Create runs the PR's applyArgonCPUAffinity
+	// between HCS-create and HCS-start.
+	c, _, cleanup := testcontainer.Create(ctx, t, nil, spec, cID, hcsOwner)
+	t.Cleanup(cleanup)
+
+	// Layers 1 & 2, pre-start gate: the affinity is already recorded on the silo job
+	// before the init process runs, proving "set after create, before start".
+	assertSiloJobAffinity(ctx, t, cID, want)
+
+	initProc := testcontainer.StartWithSpec(ctx, t, c, spec.Process, nil)
+	// Registered after the Create cleanup, so Kill/Wait run before it (cleanups are
+	// invoked last-added first).
+	t.Cleanup(func() {
+		testcontainer.Kill(ctx, t, c)
+		testcontainer.Wait(ctx, t, c)
+	})
+
+	// Layers 1 & 2 again, now that the silo has a running member.
+	assertSiloJobAffinity(ctx, t, cID, want)
+
+	pid := uint32(initProc.Process.Pid())
+
+	// Layer 3 (kernel assertion): the init process inherited the pin. Skipped if the
+	// affinity cannot be read; a real mismatch fails the test.
+	assertProcessGroupAffinity(t, pid, want)
+
+	// Layer 3, stronger process-level proof for the single-group-0 case: the
+	// per-CPU process affinity mask must equal the bits we requested.
+	// GetProcessAffinityMask only returns a meaningful mask when the process lives
+	// in a single processor group — it reports 0 once the affinity spans groups —
+	// so this is expressible only here. MultiGroup deliberately stays at
+	// membership-only (above); its exact masks remain covered by the job-object
+	// read at Layers 1 & 2.
+	if len(want) == 1 && want[0].Group == 0 {
+		assertProcessAffinityMask(t, pid, want[0].Mask)
+	}
+}
+
+// withCPUAffinity returns a SpecOpt that sets spec.Windows.Resources.CPU.Affinity.
+func withCPUAffinity(affinities []jobobject.GroupAffinity) ctrdoci.SpecOpts {
+	return func(_ context.Context, _ ctrdoci.Client, _ *containers.Container, s *specs.Spec) error {
+		// Translate the job-object shaped entries into their OCI counterparts.
+		converted := make([]specs.WindowsCPUGroupAffinity, 0, len(affinities))
+		for _, src := range affinities {
+			converted = append(converted, specs.WindowsCPUGroupAffinity{
+				Group: uint32(src.Group),
+				Mask:  src.Mask,
+			})
+		}
+
+		// Materialize every missing parent on the way down to the CPU section.
+		win := s.Windows
+		if win == nil {
+			win = &specs.Windows{}
+			s.Windows = win
+		}
+		res := win.Resources
+		if res == nil {
+			res = &specs.WindowsResources{}
+			win.Resources = res
+		}
+		cpu := res.CPU
+		if cpu == nil {
+			cpu = &specs.WindowsCPUResources{}
+			res.CPU = cpu
+		}
+
+		cpu.Affinity = converted
+		return nil
+	}
+}
+
+// assertSiloJobAffinity checks Layers 1 & 2: it opens the container's server silo
+// job object through the NT-variant name `\Container_<cID>`, reads back its CPU
+// group affinities and fails the test immediately if the job cannot be opened, the
+// affinities cannot be read, or they differ from want.
+func assertSiloJobAffinity(ctx context.Context, t *testing.T, cID string, want []jobobject.GroupAffinity) {
+	t.Helper()
+
+	opts := jobobject.Options{
+		Name:         `\Container_` + cID,
+		UseNTVariant: true,
+	}
+	silo, openErr := jobobject.Open(ctx, &opts)
+	if openErr != nil {
+		t.Fatalf("open silo job for %q: %v", cID, openErr)
+	}
+	defer silo.Close()
+
+	affinities, readErr := silo.GetCPUGroupAffinities()
+	if readErr != nil {
+		t.Fatalf("get silo job cpu group affinities: %v", readErr)
+	}
+	assertAffinitiesEqual(t, "silo job object", affinities, want)
+}
+
+// assertProcessGroupAffinity reads the group affinity the kernel placed on the init
+// process and compares it to want. The PR only writes the job object; propagation
+// onto silo members is a kernel guarantee. If the affinity cannot be read the check
+// is skipped (logged, not failed), but a successful read that omits a pinned group
+// is a hard failure.
+func assertProcessGroupAffinity(t *testing.T, pid uint32, want []jobobject.GroupAffinity) {
+	t.Helper()
+
+	proc, openErr := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid)
+	if openErr != nil {
+		// Not being able to inspect the process is logged, never treated as a failure.
+		t.Logf("Layer 3 (kernel) skipped: OpenProcess(%d): %v", pid, openErr)
+		return
+	}
+	defer windows.CloseHandle(proc) //nolint:errcheck
+
+	got, readErr := getProcessGroupAffinity(proc)
+	if readErr != nil {
+		t.Logf("Layer 3 (kernel) skipped: GetProcessGroupAffinity(%d): %v", pid, readErr)
+		return
+	}
+
+	// Only group membership is compared, not masks: the process-level query yields
+	// group numbers, so every pinned group has to appear among them.
+	reported := make(map[uint16]struct{}, len(got))
+	for _, grp := range got {
+		reported[grp] = struct{}{}
+	}
+
+	// Each distinct pinned group is checked (and, if absent, reported) exactly once.
+	visited := make(map[uint16]struct{}, len(want))
+	for _, w := range want {
+		if _, dup := visited[w.Group]; dup {
+			continue
+		}
+		visited[w.Group] = struct{}{}
+
+		if _, present := reported[w.Group]; present {
+			continue
+		}
+		t.Errorf("Layer 3 (kernel): init process not constrained to group %d; process groups = %v", w.Group, got)
+	}
+}
+
+// assertProcessAffinityMask reads the init process's per-CPU affinity mask via
+// GetProcessAffinityMask and asserts it equals wantMask. This is a stronger,
+// bit-level process check than assertProcessGroupAffinity, but it is only valid
+// for a single-group pin: GetProcessAffinityMask returns 0 once the process spans
+// more than one processor group, since a single mask can no longer describe the
+// pin. The read is skip-on-failure (logged, not failed); a zero mask is treated as
+// "unexpected multi-group state" and skipped; a non-zero mismatch is a hard failure.
+func assertProcessAffinityMask(t *testing.T, pid uint32, wantMask uint64) {
+	t.Helper()
+
+	proc, openErr := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid)
+	if openErr != nil {
+		t.Logf("Layer 3 (process mask) skipped: OpenProcess(%d): %v", pid, openErr)
+		return
+	}
+	defer windows.CloseHandle(proc) //nolint:errcheck
+
+	switch mask, readErr := getProcessAffinityMask(proc); {
+	case readErr != nil:
+		t.Logf("Layer 3 (process mask) skipped: GetProcessAffinityMask(%d): %v", pid, readErr)
+	case mask == 0:
+		// A zero mask is read as "the process spans several processor groups", a
+		// state in which a single mask says nothing. The per-group bits are already
+		// verified through the job-object read (Layers 1 & 2), so only log it.
+		t.Logf("Layer 3 (process mask) skipped: process affinity mask is 0 (unexpected multi-group state)")
+	case mask != wantMask:
+		t.Errorf("Layer 3 (process mask): process affinity mask = %#x, want %#x", mask, wantMask)
+	}
+}
+
+// assertAffinitiesEqual fails the test immediately unless got and want hold the same
+// number of entries and every group listed in want is present in got with an
+// identical mask. what names the inspected object in the failure messages.
+func assertAffinitiesEqual(t *testing.T, what string, got, want []jobobject.GroupAffinity) {
+	t.Helper()
+
+	if len(want) != len(got) {
+		t.Fatalf("%s affinity: got %+v, want %+v (length mismatch)", what, got, want)
+	}
+
+	// Index the observed masks by group so the comparison does not depend on the
+	// order in which the OS hands the entries back.
+	observed := make(map[uint16]uint64, len(got))
+	for i := range got {
+		observed[got[i].Group] = got[i].Mask
+	}
+
+	for _, expected := range want {
+		switch actual, found := observed[expected.Group]; {
+		case !found:
+			t.Fatalf("%s affinity: missing group %d; got %+v, want %+v", what, expected.Group, got, want)
+		case actual != expected.Mask:
+			t.Fatalf("%s affinity: group %d mask = %#x, want %#x", what, expected.Group, actual, expected.Mask)
+		}
+	}
+}
+
+// getProcessGroupAffinity returns the processor groups the process may run on via
+// winapi.GetProcessGroupAffinity (kernel32!GetProcessGroupAffinity).
+func getProcessGroupAffinity(h windows.Handle) ([]uint16, error) {
+	// Start with room for four groups. When that is too small the call fails with
+	// ERROR_INSUFFICIENT_BUFFER and leaves the required length in n, so grow the
+	// buffer to that size and ask again.
+	buf := make([]uint16, 4)
+	n := uint16(len(buf))
+	for {
+		err := winapi.GetProcessGroupAffinity(h, &n, &buf[0])
+		switch {
+		case err == nil:
+			return buf[:n], nil
+		case errors.Is(err, windows.ERROR_INSUFFICIENT_BUFFER) && int(n) > len(buf):
+			buf = make([]uint16, n)
+		default:
+			return nil, err
+		}
+	}
+}
+
+// getProcessAffinityMask returns the process affinity bitmask reported by
+// winapi.GetProcessAffinityMask (kernel32!GetProcessAffinityMask), widened to
+// uint64; the system-wide mask the call also yields is discarded. Callers treat a
+// zero result as "process spans several processor groups".
+func getProcessAffinityMask(h windows.Handle) (uint64, error) {
+	var (
+		procMask uintptr
+		sysMask  uintptr
+	)
+	err := winapi.GetProcessAffinityMask(h, &procMask, &sysMask)
+	if err != nil {
+		return 0, err
+	}
+	return uint64(procMask), nil
+}
+
+// activeProcessorGroupCount reports how many processor groups are active on the
+// host, as an int, so callers can decide whether a multi-group affinity test can run.
+func activeProcessorGroupCount(t *testing.T) int {
+	t.Helper()
+	groups := winapi.GetActiveProcessorGroupCount()
+	return int(groups)
+}