From 20e969161048b53a4618aa715c75e29e3e797695 Mon Sep 17 00:00:00 2001
From: Rajath Agasthya <ragasthya@nvidia.com>
Date: Wed, 6 May 2026 13:45:12 -0500
Subject: [PATCH] Remove shell dependency from validator pods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NVIDIA's distroless-cc `-dev` tag (the gpu-operator image base) will no
longer be approved as a STIG parent image. The non-`-dev` variant ships
no shell, so the validator daemonsets and workload validation pods —
which wrapped binaries in `sh -c` and used shell-based preStop hooks —
would break on the new base. Re-adding a shell to the image would only
swap one CVE source for another.

Replace shell wrappers with direct binary invocation. The
operator-validator and sandbox-validator init containers invoke
`nvidia-validator` directly. Their pause containers use a new top-level
`--sleep` flag that prints the validator-success message and blocks
until SIGTERM, SIGINT, or SIGHUP arrives. Workload pod main containers
run `nvidia-validator --version` as a no-op that exits 0; the
per-workload success message is now printed from
`(c *CUDA).runWorkload` and `(p *Plugin).runWorkload` after
`waitForPod` succeeds — surfacing in the operator-validator init
container logs where success is actually established.

For preStop cleanup, add a small static helper `rmglob` that takes
glob patterns and removes matching paths. Modeled on k8s-cc-manager's
vendored static `/bin/rm`, shipped at `/usr/bin/rmglob`. Both
validator daemonsets keep their `lifecycle.preStop` blocks; they now
call this binary instead of `sh -c rm`.

Stop copying `hack/must-gather.sh` into the image as `/usr/bin/gather`.
The script depends on `bash`, `kubectl`, and `oc` — none of which ship
in the distroless base. Customers already run the script from outside
the cluster against an existing kubeconfig; removing the in-image copy
doesn't change that workflow.

Flip the Dockerfile base to `nvcr.io/nvidia/distroless/cc:v4.0.4`.

Signed-off-by: Rajath Agasthya <ragasthya@nvidia.com>
---
 .../0500_daemonset.yaml                       | 18 ++---
 .../0500_daemonset.yaml                       | 18 ++---
 cmd/nvidia-validator/main.go                  | 75 +++++++++++++++----
 cmd/nvidia-validator/main_test.go             | 71 ++++++++++++++++++
 cmd/rmglob/main.go                            | 60 +++++++++++++++
 cmd/rmglob/main_test.go                       | 75 +++++++++++++++++++
 docker/Dockerfile                             |  5 +-
 .../manifests/cuda-workload-validation.yaml   |  7 +-
 .../manifests/plugin-workload-validation.yaml |  7 +-
 9 files changed, 288 insertions(+), 48 deletions(-)
 create mode 100644 cmd/rmglob/main.go
 create mode 100644 cmd/rmglob/main_test.go

diff --git a/assets/state-operator-validation/0500_daemonset.yaml b/assets/state-operator-validation/0500_daemonset.yaml
index cddc4f8bb..20247d4f7 100644
--- a/assets/state-operator-validation/0500_daemonset.yaml
+++ b/assets/state-operator-validation/0500_daemonset.yaml
@@ -28,8 +28,7 @@ spec:
       initContainers:
         - name: driver-validation
           image: "FILLED BY THE OPERATOR"
-          command: ['sh', '-c']
-          args: ["nvidia-validator"]
+          command: ["nvidia-validator"]
           env:
             - name: WITH_WAIT
               value: "true"
@@ -58,8 +57,7 @@ spec:
               mountPath: /host-dev-char
         - name: toolkit-validation
           image: "FILLED BY THE OPERATOR"
-          command: ['sh', '-c']
-          args: ["nvidia-validator"]
+          command: ["nvidia-validator"]
           env:
           - name: NVIDIA_VISIBLE_DEVICES
             value: "all"
@@ -75,8 +73,7 @@ spec:
               mountPropagation: Bidirectional
         - name: cuda-validation
           image: "FILLED BY THE OPERATOR"
-          command: ['sh', '-c']
-          args: ["nvidia-validator"]
+          command: ["nvidia-validator"]
           env:
           - name: WITH_WAIT
             value: "false"
@@ -98,8 +95,7 @@ spec:
               mountPropagation: Bidirectional
         - name: plugin-validation
           image: "FILLED BY THE OPERATOR"
-          command: ['sh', '-c']
-          args: ["nvidia-validator"]
+          command: ["nvidia-validator"]
           env:
           - name: COMPONENT
             value: plugin
@@ -126,14 +122,14 @@ spec:
       containers:
         - image: "FILLED BY THE OPERATOR"
           name: nvidia-operator-validator
-          command: ['sh', '-c']
-          args: ["echo all validations are successful; while true; do sleep 86400; done"]
+          command: ["nvidia-validator"]
+          args: ["--sleep"]
           securityContext:
             privileged: true
           lifecycle:
             preStop:
               exec:
-                command: ["sh", "-c", "rm -f /run/nvidia/validations/*-ready"]
+                command: ["/usr/bin/rmglob", "/run/nvidia/validations/*-ready"]
           volumeMounts:
             - name: run-nvidia-validations
               mountPath: "/run/nvidia/validations"
diff --git a/assets/state-sandbox-validation/0500_daemonset.yaml b/assets/state-sandbox-validation/0500_daemonset.yaml
index fcc2aa12a..b5731a543 100644
--- a/assets/state-sandbox-validation/0500_daemonset.yaml
+++ b/assets/state-sandbox-validation/0500_daemonset.yaml
@@ -28,8 +28,7 @@ spec:
       initContainers:
         - name: cc-manager-validation
           image: "FILLED BY THE OPERATOR"
-          command: ['sh', '-c']
-          args: ["nvidia-validator"]
+          command: ["nvidia-validator"]
           env:
             - name: WITH_WAIT
               value: "true"
@@ -49,8 +48,7 @@ spec:
               mountPropagation: Bidirectional
         - name: vfio-pci-validation
           image: "FILLED BY THE OPERATOR"
-          command: ['sh', '-c']
-          args: ["nvidia-validator"]
+          command: ["nvidia-validator"]
           env:
             - name: WITH_WAIT
               value: "true"
@@ -74,8 +72,7 @@ spec:
               mountPropagation: Bidirectional
         - name: vgpu-manager-validation
           image: "FILLED BY THE OPERATOR"
-          command: ['sh', '-c']
-          args: ["nvidia-validator"]
+          command: ["nvidia-validator"]
           env:
           - name: WITH_WAIT
             value: "true"
@@ -102,8 +99,7 @@ spec:
               mountPropagation: Bidirectional
         - name: vgpu-devices-validation
           image: "FILLED BY THE OPERATOR"
-          command: ['sh', '-c']
-          args: ["nvidia-validator"]
+          command: ["nvidia-validator"]
           env:
           - name: WITH_WAIT
             value: "true"
@@ -122,14 +118,14 @@ spec:
       containers:
         - image: "FILLED BY THE OPERATOR"
           name: nvidia-sandbox-validator
-          command: ['sh', '-c']
-          args: ["echo all validations are successful; while true; do sleep 86400; done"]
+          command: ["nvidia-validator"]
+          args: ["--sleep"]
           securityContext:
             privileged: true
           lifecycle:
             preStop:
               exec:
-                command: ["sh", "-c", "rm -f /run/nvidia/validations/*"]
+                command: ["/usr/bin/rmglob", "/run/nvidia/validations/*"]
           volumeMounts:
             - name: run-nvidia-validations
               mountPath: "/run/nvidia/validations"
diff --git a/cmd/nvidia-validator/main.go b/cmd/nvidia-validator/main.go
index 8ffa41e28..c373dc11c 100644
--- a/cmd/nvidia-validator/main.go
+++ b/cmd/nvidia-validator/main.go
@@ -135,6 +135,7 @@ var (
 	hostRootFlag                  string
 	driverInstallDirFlag          string
 	driverInstallDirCtrPathFlag   string
+	sleepFlag                     bool
 )
 
 // defaultGPUWorkloadConfig is "vm-passthrough" unless
@@ -375,14 +376,17 @@ func main() {
 			Destination: &driverInstallDirCtrPathFlag,
 			Sources:     cli.EnvVars("DRIVER_INSTALL_DIR_CTR_PATH"),
 		},
+		&cli.BoolFlag{
+			Name:        "sleep",
+			Usage:       "after any other action, print the validator-success message and block until SIGTERM/SIGINT/SIGHUP, then exit 0",
+			Destination: &sleepFlag,
+			Sources:     cli.EnvVars("SLEEP"),
+		},
 	}
 
 	// Log version info
 	log.Infof("version: %s", c.Version)
 
-	// Handle signals
-	go handleSignal()
-
 	// invoke command
 	err := c.Run(context.Background(), os.Args)
 	if err != nil {
@@ -404,6 +408,10 @@ func handleSignal() {
 
 func validateFlags(ctx context.Context, cli *cli.Command) (context.Context, error) {
 	if componentFlag == "" {
+		// Standalone --sleep mode does not require a component.
+		if sleepFlag {
+			return ctx, nil
+		}
 		return ctx, fmt.Errorf("invalid -c <component-name> flag: must not be empty string")
 	}
 	if !isValidComponent() {
@@ -509,24 +517,59 @@ func getWorkloadConfig(ctx context.Context) (string, error) {
 }
 
 func start(ctx context.Context, cli *cli.Command) error {
-	// if cleanup is requested, delete all existing status files(default)
-	if cleanupAllFlag {
-		// cleanup output directory and create again each time
-		err := os.RemoveAll(outputDirFlag)
-		if err != nil {
-			if !os.IsNotExist(err) {
-				return err
+	// In sleep mode, runSleep installs its own signal handler. Otherwise
+	// preserve legacy behavior: any signal terminates the process.
+	if !sleepFlag {
+		go handleSignal()
+	}
+
+	if componentFlag != "" {
+		// if cleanup is requested, delete all existing status files(default)
+		if cleanupAllFlag {
+			// cleanup output directory and create again each time
+			err := os.RemoveAll(outputDirFlag)
+			if err != nil {
+				if !os.IsNotExist(err) {
+					return err
+				}
 			}
 		}
+
+		// create status directory
+		err := os.Mkdir(outputDirFlag, 0755)
+		if err != nil && !os.IsExist(err) {
+			return err
+		}
+
+		if err := validateComponent(ctx, componentFlag); err != nil {
+			return err
+		}
 	}
 
-	// create status directory
-	err := os.Mkdir(outputDirFlag, 0755)
-	if err != nil && !os.IsExist(err) {
-		return err
+	if sleepFlag {
+		return runSleep(ctx)
 	}
+	return nil
+}
 
-	return validateComponent(ctx, componentFlag)
+// runSleep is the body of the `--sleep` mode: it emits the
+// validator-success message, then parks until SIGTERM, SIGINT or SIGHUP
+// is delivered (or ctx is canceled) and returns nil. Removing status
+// markers is left to the rmglob binary run from `lifecycle.preStop`.
+func runSleep(ctx context.Context) error {
+	fmt.Println("all validations are successful")
+
+	stop := make(chan os.Signal, 1)
+	signal.Notify(stop, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP)
+	defer signal.Stop(stop)
+
+	select {
+	case sig := <-stop:
+		log.Infof("received signal %s", sig)
+	case <-ctx.Done():
+		log.Infof("context canceled")
+	}
+	return nil
+}
 
 func validateComponent(ctx context.Context, componentFlag string) error {
@@ -1368,6 +1411,7 @@ func (p *Plugin) runWorkload() error {
 	if err != nil {
 		return err
 	}
+	fmt.Println("device-plugin workload validation is successful")
 	return nil
 }
 
@@ -1621,6 +1665,7 @@ func (c *CUDA) runWorkload() error {
 	if err != nil {
 		return err
 	}
+	fmt.Println("cuda workload validation is successful")
 	return nil
 }
 
diff --git a/cmd/nvidia-validator/main_test.go b/cmd/nvidia-validator/main_test.go
index d0199dd18..0a84eb9fe 100644
--- a/cmd/nvidia-validator/main_test.go
+++ b/cmd/nvidia-validator/main_test.go
@@ -19,7 +19,9 @@ package main
 import (
 	"context"
 	"os"
+	"syscall"
 	"testing"
+	"time"
 )
 
 func Test_isValidComponent(t *testing.T) {
@@ -216,3 +218,72 @@ UNKNOWN_FEATURE: true`,
 		})
 	}
 }
+
+// Test_validateFlags_standaloneSleep runs validateFlags over four
+// componentFlag/sleepFlag combinations; only "neither set" expects an error.
+func Test_validateFlags_standaloneSleep(t *testing.T) {
+	cases := []struct {
+		name      string
+		component string
+		sleep     bool
+		wantErr   bool
+	}{
+		{name: "no component, no sleep: error", wantErr: true},
+		{name: "no component, sleep: ok", sleep: true},
+		{name: "valid component, no sleep: ok", component: "driver"},
+		{name: "valid component, sleep: ok", component: "driver", sleep: true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// The flags are package globals: swap them in, restore on exit.
+			prevComponent, prevSleep := componentFlag, sleepFlag
+			t.Cleanup(func() { componentFlag, sleepFlag = prevComponent, prevSleep })
+			componentFlag, sleepFlag = tc.component, tc.sleep
+
+			switch _, err := validateFlags(context.Background(), nil); {
+			case tc.wantErr && err == nil:
+				t.Errorf("validateFlags() expected error, got nil")
+			case !tc.wantErr && err != nil:
+				t.Errorf("validateFlags() unexpected error: %v", err)
+			}
+		})
+	}
+}
+
+func Test_runSleep_returnsOnSignal(t *testing.T) {
+	done := make(chan error, 1)
+	go func() { done <- runSleep(context.Background()) }()
+	// Best-effort wait for runSleep to register its handler: a SIGTERM that
+	// arrives earlier takes the default action and ends the test binary.
+	time.Sleep(50 * time.Millisecond)
+	if err := syscall.Kill(syscall.Getpid(), syscall.SIGTERM); err != nil {
+		t.Fatalf("kill: %v", err)
+	}
+
+	select {
+	case <-time.After(2 * time.Second):
+		t.Fatalf("runSleep did not return within 2s of SIGTERM")
+	case err := <-done:
+		if err != nil {
+			t.Errorf("runSleep returned error: %v", err)
+		}
+	}
+}
+
+func Test_runSleep_contextCancel(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	done := make(chan error, 1)
+	go func() { done <- runSleep(ctx) }()
+
+	time.Sleep(50 * time.Millisecond)
+	cancel()
+	// No signal is sent here: canceling ctx alone must unblock runSleep.
+	select {
+	case <-time.After(2 * time.Second):
+		t.Fatalf("runSleep did not return within 2s of context cancel")
+	case err := <-done:
+		if err != nil {
+			t.Errorf("runSleep returned error: %v", err)
+		}
+	}
+}
diff --git a/cmd/rmglob/main.go b/cmd/rmglob/main.go
new file mode 100644
index 000000000..bc3a3dc1e
--- /dev/null
+++ b/cmd/rmglob/main.go
@@ -0,0 +1,84 @@
+/*
+Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// rmglob is a tiny static helper binary that expands one or more glob
+// patterns and removes the matching paths. It exists so that distroless
+// gpu-operator container images can run path cleanup from a Kubernetes
+// `lifecycle.preStop` hook without needing a shell on the image.
+//
+// It is the path-cleanup analog of k8s-cc-manager's vendored static `/bin/rm`.
+//
+// Patterns use filepath.Glob syntax with one shell-compatible adjustment: a
+// match whose final path element begins with "." is skipped unless the
+// pattern's final element also begins with ".". Shell globs behave this way,
+// so `dir/*` and `dir/*-ready` keep leaving hidden files alone, as they did
+// in the `sh -c "rm -f ..."` hooks this binary replaces.
+package main
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// skipHidden reports whether match must be left alone because its final path
+// element is dot-prefixed while the final element of pattern is not.
+// filepath.Glob lets `*` and `?` match a leading "."; POSIX shells do not.
+func skipHidden(pattern, match string) bool {
+	return strings.HasPrefix(filepath.Base(match), ".") &&
+		!strings.HasPrefix(filepath.Base(pattern), ".")
+}
+
+// main removes every path selected by the glob patterns in os.Args[1:].
+//
+// Exit status is 0 when every selected path was removed (a pattern that
+// matches nothing is not an error), 1 when a pattern is malformed or a
+// removal fails, and 2 when no pattern is given. Processing continues past
+// individual failures so that one bad path does not block the rest.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Fprintln(os.Stderr, "usage: rmglob <glob>...")
+		os.Exit(2)
+	}
+
+	var failed bool
+	for _, pattern := range os.Args[1:] {
+		matches, err := filepath.Glob(pattern)
+		if err != nil {
+			//#nosec G705 -- stderr diagnostic, not a network-reachable sink
+			fmt.Fprintf(os.Stderr, "rmglob: invalid pattern %q: %v\n", pattern, err)
+			failed = true
+			continue
+		}
+		for _, m := range matches {
+			if skipHidden(pattern, m) {
+				continue
+			}
+			// Path removal is the binary's sole purpose; the patterns come from
+			// gpu-operator-rendered manifests, not external user input.
+			//#nosec G703 -- intentional path removal
+			if err := os.RemoveAll(m); err != nil {
+				//#nosec G705 -- stderr diagnostic, not a network-reachable sink
+				fmt.Fprintf(os.Stderr, "rmglob: remove %q: %v\n", m, err)
+				failed = true
+			}
+		}
+	}
+	if failed {
+		os.Exit(1)
+	}
+}
diff --git a/cmd/rmglob/main_test.go b/cmd/rmglob/main_test.go
new file mode 100644
index 000000000..432971c9a
--- /dev/null
+++ b/cmd/rmglob/main_test.go
@@ -0,0 +1,75 @@
+/*
+Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"testing"
+)
+
+var rmglobBin string
+
+// TestMain builds rmglob into a scratch dir for the tests to exec.
+func TestMain(m *testing.M) {
+	workDir, err := os.MkdirTemp("", "rmglob-test-")
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "tempdir: %v\n", err)
+		os.Exit(2)
+	}
+	rmglobBin = filepath.Join(workDir, "rmglob")
+	status := 2
+	if out, err := exec.Command("go", "build", "-o", rmglobBin, ".").CombinedOutput(); err != nil {
+		fmt.Fprintf(os.Stderr, "build: %v\n%s", err, out)
+	} else {
+		status = m.Run()
+	}
+	os.RemoveAll(workDir)
+	os.Exit(status)
+}
+
+func TestRmglob(t *testing.T) {
+	root := t.TempDir()
+	at := func(name string) string { return filepath.Join(root, name) }
+	for _, name := range []string{"a-ready", "b-ready", "keep.txt"} {
+		if err := os.WriteFile(at(name), []byte("x"), 0600); err != nil {
+			t.Fatalf("write: %v", err)
+		}
+	}
+	//#nosec G204 -- test-only invocation of a binary built by TestMain
+	if out, err := exec.Command(rmglobBin, at("*-ready")).CombinedOutput(); err != nil {
+		t.Fatalf("run: %v\n%s", err, out)
+	}
+	// Only the two *-ready files may disappear; keep.txt must survive.
+	if _, err := os.Stat(at("a-ready")); !os.IsNotExist(err) {
+		t.Errorf("a-ready should be removed, stat err=%v", err)
+	}
+	if _, err := os.Stat(at("b-ready")); !os.IsNotExist(err) {
+		t.Errorf("b-ready should be removed, stat err=%v", err)
+	}
+	if _, err := os.Stat(at("keep.txt")); err != nil {
+		t.Errorf("keep.txt should remain, stat err=%v", err)
+	}
+}
+
+func TestRmglobNoArgs(t *testing.T) {
+	err := exec.Command(rmglobBin).Run() // no patterns: usage error expected
+	if err == nil {
+		t.Errorf("rmglob with no args expected non-zero exit, got 0")
+	}
+}
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 121eed9ec..570706023 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -72,7 +72,7 @@ RUN curl -L https://codeload.github.com/NVIDIA/cuda-samples/tar.gz/refs/tags/v${
 
 # The C/C++ distroless image is used as a base since the CUDA vectorAdd
 # sample application depends on C/C++ libraries.
-FROM nvcr.io/nvidia/distroless/cc:v4.0.4-dev
+FROM nvcr.io/nvidia/distroless/cc:v4.0.4
 
 ENV NVIDIA_VISIBLE_DEVICES=void
 
@@ -92,6 +92,7 @@ WORKDIR /
 COPY --from=builder /workspace/gpu-operator /usr/bin/
 COPY --from=builder /workspace/manage-crds /usr/bin/
 COPY --from=builder /workspace/nvidia-validator /usr/bin/
+COPY --from=builder /workspace/rmglob /usr/bin/
 COPY --from=sample-builder /build/vectorAdd /usr/bin/vectorAdd
 ARG CUDA_SAMPLES_VERSION
 COPY --from=sample-builder /usr/local/cuda-${CUDA_SAMPLES_VERSION}/compat /usr/local/cuda/compat
@@ -100,8 +101,6 @@ COPY assets /opt/gpu-operator/
 COPY manifests /opt/gpu-operator/manifests
 COPY validator/manifests /opt/validator/manifests
 
-COPY hack/must-gather.sh /usr/bin/gather
-
 # Add CRD resource into the image for helm upgrades
 COPY deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml /opt/gpu-operator/nvidia.com_clusterpolicies.yaml
 COPY deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml /opt/gpu-operator/nvidia.com_nvidiadrivers.yaml
diff --git a/validator/manifests/cuda-workload-validation.yaml b/validator/manifests/cuda-workload-validation.yaml
index fa47df5f7..614454048 100644
--- a/validator/manifests/cuda-workload-validation.yaml
+++ b/validator/manifests/cuda-workload-validation.yaml
@@ -18,8 +18,7 @@ spec:
   - name: cuda-validation
     image: "FILLED_BY_THE_VALIDATOR"
     imagePullPolicy: IfNotPresent
-    command: ['sh', '-c']
-    args: ["vectorAdd"]
+    command: ["vectorAdd"]
     env:
     - name: NVIDIA_VISIBLE_DEVICES
       value: "all"
@@ -30,8 +29,8 @@ spec:
       image: "FILLED_BY_THE_VALIDATOR"
       imagePullPolicy: IfNotPresent
       # override command and args as validation is already done by initContainer
-      command: ['sh', '-c']
-      args: ["echo cuda workload validation is successful"]
+      command: ["nvidia-validator"]
+      args: ["--version"]
       securityContext:
         privileged: true
         readOnlyRootFilesystem: true
diff --git a/validator/manifests/plugin-workload-validation.yaml b/validator/manifests/plugin-workload-validation.yaml
index 80bb657e2..d77551ad4 100644
--- a/validator/manifests/plugin-workload-validation.yaml
+++ b/validator/manifests/plugin-workload-validation.yaml
@@ -16,8 +16,7 @@ spec:
   - name: plugin-validation
     image: "FILLED_BY_VALIDATOR"
     imagePullPolicy: IfNotPresent
-    command: ['sh', '-c']
-    args: ["vectorAdd"]
+    command: ["vectorAdd"]
     securityContext:
       allowPrivilegeEscalation: false
     resources:
@@ -28,8 +27,8 @@ spec:
       image: "FILLED_BY_VALIDATOR"
       imagePullPolicy: IfNotPresent
       # override command and args as validation is already done by initContainer
-      command: ['sh', '-c']
-      args: ["echo device-plugin workload validation is successful"]
+      command: ["nvidia-validator"]
+      args: ["--version"]
       securityContext:
         allowPrivilegeEscalation: false
         readOnlyRootFilesystem: true