diff --git a/AI/MIGraphX/gpu_resident_yolo26_pipeline/.gitignore b/AI/MIGraphX/gpu_resident_yolo26_pipeline/.gitignore new file mode 100644 index 000000000..1a10b0730 --- /dev/null +++ b/AI/MIGraphX/gpu_resident_yolo26_pipeline/.gitignore @@ -0,0 +1,17 @@ +# Build artefacts +*.mxr +*.onnx +yolo26s.pt + +# Run outputs +output*.mp4 +_run_artifacts/ + +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +.venv/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ diff --git a/AI/MIGraphX/gpu_resident_yolo26_pipeline/README.md b/AI/MIGraphX/gpu_resident_yolo26_pipeline/README.md new file mode 100644 index 000000000..9b3c1dc55 --- /dev/null +++ b/AI/MIGraphX/gpu_resident_yolo26_pipeline/README.md @@ -0,0 +1,110 @@ +# rocDecode and MIGraphX Zero-Copy YOLO Video Inference (Python) + +![YOLO26 detections (person, bicycle) drawn on a frame from a peloton of cyclists](images/result_boxes.jpg) + +## Description + +This example keeps a video frame on the GPU from decode through to detection +on AMD hardware. The on-chip VCN engine decodes the bitstream via +[rocDecode](https://rocm.docs.amd.com/projects/rocDecode/en/latest/), +[DLPack](https://github.com/dmlc/dlpack) hands the decoded surface to PyTorch +as a zero-copy view, YOLO preprocessing runs on the active HIP stream, and +[MIGraphX](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/) runs +the compiled YOLO26 graph on that same stream. Only the surviving detections +cross the PCIe bus. An OpenCV CPU-decode path is provided as a baseline. + +## Application Flow + +1. Load the compiled MIGraphX `.mxr` model and open the input video. +2. For every frame: + 1. **Decode.** rocDecode demuxes the bitstream and the VCN engine decodes + to a GPU surface. The `opencv` baseline does this on the CPU. + 2. **DLPack wrap.** The decoded RGB surface is exposed to PyTorch as a + zero-copy `[H, W, 3]` uint8 CUDA tensor. + 3. 
**Preprocess.** HWC uint8 RGB is permuted to BCHW float32, then + resized and letterbox-padded to 640x640 on the active HIP stream. + 4. **Inference.** `migraphx.run_async` is enqueued on the same stream as + preprocessing, so no implicit synchronization happens between them. + 5. **Postprocess.** Confidence filter and letterbox-to-source remap stay + on-device. Only the surviving boxes are copied back to the host. +3. Draw boxes on each frame and write an MP4 with detections. + +## Key APIs and Concepts + +- **Zero-copy decoder-to-PyTorch handoff.** rocPyDecode exposes its decoded + surface through DLPack, and `torch.from_dlpack` wraps it as a CUDA tensor + that shares the same memory. There is no `hipMemcpy` between decode and + preprocess. +- **Single HIP stream for preprocess and inference.** For every frame, `Detector` reads + `torch.cuda.current_stream()` and passes that stream's handle to `run_async`, so + preprocess and inference serialize on the GPU without host synchronization. +- **Pre-allocated MIGraphX output.** The output tensor is allocated once + with `torch.empty_strided` and bound to MIGraphX as a raw pointer via + `migraphx.argument_from_pointer`, avoiding per-frame device allocation. + +## Demonstrated API Calls + +### rocDecode (Python bindings) + +- `pyRocVideoDecode.demuxer.demuxer` +- `pyRocVideoDecode.decoder.decoder` +- `decoder.DecodeFrame`, `decoder.GetFrameRgb`, `decoder.ReleaseFrame` +- `demuxer.DemuxFrame`, `demuxer.GetCodecId`, `demuxer.GetBitDepth` + +### MIGraphX (Python) + +- `migraphx.parse_onnx`, `migraphx.quantize_fp16`, `migraphx.save`, + `migraphx.load` +- `migraphx.program.run_async`, `migraphx.argument_from_pointer` +- `migraphx.get_target` + +### PyTorch and DLPack + +- `torch.from_dlpack` +- `torch.cuda.current_stream`, `torch.cuda.Stream.synchronize` +- `torch.empty_strided`, `torch.nn.functional.interpolate`, + `torch.nn.functional.pad` + +## Setup + +Run inside the official ROCm PyTorch container.
PyTorch and MIGraphX are +pre-installed; rocDecode and its libva backend are installed below. + +```bash +docker run --rm -it \ + --device=/dev/kfd --device=/dev/dri \ + --group-add video --ipc=host --shm-size=8g \ + -v "$PWD":/workspace -w /workspace \ + rocm/pytorch:rocm7.2.2_ubuntu22.04_py3.10_pytorch_release_2.10.0 +``` + +Inside the container: + +```bash +# The AMDGPU "graphics" repo provides libva-amdgpu / mesa-amdgpu VA drivers +# that rocDecode needs at runtime but the rocm/pytorch image does not ship. +echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] \ +https://repo.radeon.com/graphics/7.2.3/ubuntu jammy main" \ + > /etc/apt/sources.list.d/amdgpu-graphics.list + +apt-get update +apt-get install -y --no-install-recommends \ + rocdecode rocpydecode rocdecode-host \ + libva-amdgpu-drm2 mesa-amdgpu-va-drivers + +# rocDecode Python bindings ship in /opt/rocm/lib. +export PYTHONPATH=/opt/rocm/lib + +pip install -r requirements.txt +python3 prepare_model.py # exports YOLO26s ONNX, compiles to model.mxr +``` + +## Run + +```bash +python3 main.py --decoder rocdecode --input data/peloton_sample_ai_gen.mp4 --output output.mp4 +python3 main.py --decoder opencv --input data/peloton_sample_ai_gen.mp4 --output output_cv.mp4 +``` + +`main.py --help` lists every flag. The script prints average per-frame +`predict()` and full-pipeline latencies on exit. 
diff --git a/AI/MIGraphX/gpu_resident_yolo26_pipeline/data/peloton_sample_ai_gen.mp4 b/AI/MIGraphX/gpu_resident_yolo26_pipeline/data/peloton_sample_ai_gen.mp4 new file mode 100644 index 000000000..cfc8249f3 Binary files /dev/null and b/AI/MIGraphX/gpu_resident_yolo26_pipeline/data/peloton_sample_ai_gen.mp4 differ diff --git a/AI/MIGraphX/gpu_resident_yolo26_pipeline/images/result_boxes.jpg b/AI/MIGraphX/gpu_resident_yolo26_pipeline/images/result_boxes.jpg new file mode 100644 index 000000000..80368a778 Binary files /dev/null and b/AI/MIGraphX/gpu_resident_yolo26_pipeline/images/result_boxes.jpg differ diff --git a/AI/MIGraphX/gpu_resident_yolo26_pipeline/main.py b/AI/MIGraphX/gpu_resident_yolo26_pipeline/main.py new file mode 100644 index 000000000..55f5a3576 --- /dev/null +++ b/AI/MIGraphX/gpu_resident_yolo26_pipeline/main.py @@ -0,0 +1,435 @@ +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: MIT + +"""GPU-resident YOLO video inference on AMD GPUs. + +From decode to detection every frame stays in VRAM; only the surviving +boxes (and, for the demo, the raw RGB frame used to draw the output MP4) +cross the PCIe bus. Two decoder paths are provided: + +* ``rocdecode`` pulls a zero-copy DLPack view out of rocPyDecode and feeds + it into MIGraphX on the active HIP stream. +* ``opencv`` decodes on the CPU and serves as a same-machine baseline. 
+""" + +from __future__ import annotations + +import argparse +import time +from dataclasses import dataclass +from pathlib import Path + +import cv2 +import migraphx +import numpy as np +import torch +import torch.nn.functional as F +from rich.progress import track + + +# ============================================================================ +# Constants +# ============================================================================ + +TARGET_INPUT_SIZE = 640 # YOLO26 expects square 640x640 input +LETTERBOX_FILL_NORM = 114.0 / 255.0 # YOLO letterbox padding value (RGB, normalised) +DEFAULT_DEVICE_ID = 0 # GPU index passed to rocDecode +MP4_FOURCC = "mp4v" # cv2.VideoWriter codec +ROC_RGB_FORMAT_RGB = 3 # pyRocVideoDecode rgb_format enum: RGB packed + +# Approximate CCIR 601 luma weights (B, G, R) used to pick black/white text on a coloured background. +LUMA_WEIGHTS_BGR = (0.114, 0.587, 0.299) +LUMA_TEXT_DARK_THRESHOLD = 140 + +# Deterministic per-class colour palette (BGR); same class keeps the same tint across frames. 
PALETTE_BGR = [
    (255, 119, 46), (180, 119, 31), (14, 127, 255), (44, 160, 44),
    (40, 39, 214), (189, 103, 148), (75, 86, 140), (127, 127, 127),
    (34, 189, 188), (207, 190, 23), (232, 176, 174), (120, 187, 255),
    (150, 218, 152), (148, 156, 255), (156, 158, 199), (207, 199, 196),
    (219, 219, 197), (229, 218, 158), (165, 214, 197), (154, 204, 219),
]


@dataclass(frozen=True)
class Detection:
    """One YOLO detection: class id/name, confidence, and ``(x, y, w, h)`` box in pixels."""

    class_id: int
    class_name: str
    confidence: float
    box: tuple[int, int, int, int]


@dataclass(frozen=True)
class VideoInfo:
    """Probed video metadata: fps, frame size, and total frame count."""

    fps: int
    width: int
    height: int
    total_frames: int

    def __str__(self) -> str:
        return f"{self.width}x{self.height} @ {self.fps}fps, {self.total_frames} frames"


@dataclass
class Timings:
    """Per-frame timing accumulator: predict() and full decode+predict pipeline."""

    frames: int = 0
    predict_s: float = 0.0   # Step 2-4 inside Detector.detect_on_gpu()
    pipeline_s: float = 0.0  # decode + predict (host-side draw/write excluded)

    @staticmethod
    def _fps(frames: int, seconds: float) -> float:
        """Return frames per second, or ``inf`` when no time has been accumulated."""
        return frames / seconds if seconds > 0 else float("inf")

    def report(self, pipeline_label: str, output_path: str | Path) -> None:
        """Print average ms/frame and fps for predict() and the full pipeline.

        Prints nothing when no frame was processed. A zero accumulated time is
        reported as ``inf`` fps instead of raising ``ZeroDivisionError``.
        """
        if self.frames == 0:
            return
        predict_ms = self.predict_s / self.frames * 1000
        pipeline_ms = self.pipeline_s / self.frames * 1000
        predict_fps = self._fps(self.frames, self.predict_s)
        pipeline_fps = self._fps(self.frames, self.pipeline_s)
        print(f"\n{'=' * 60}")
        print(f"Processing complete! Total frames: {self.frames}")
        print(f"Average predict(): {predict_ms:.2f} ms ({predict_fps:.1f} fps)")
        print(f"Average {pipeline_label}: {pipeline_ms:.2f} ms ({pipeline_fps:.1f} fps)")
        print(f"Output saved to: {output_path}")
        print(f"{'=' * 60}\n")


# ============================================================================
# Detector: model state + Step 2/3/4 helpers
# ============================================================================

class Detector:
    """GPU-resident YOLO26 detector backed by a compiled MIGraphX .mxr model.

    Encapsulates the model state and the Step 2 (preprocess), Step 3 (inference),
    and Step 4 (postprocess) operations of the pipeline.
    """

    COCO_CLASSES = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
        "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
        "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
    ]

    def __init__(self, model_path: str | Path, conf_threshold: float = 0.25):
        """Load the compiled MIGraphX model and pre-allocate its GPU output buffer.

        Raises:
            ValueError: if the model has no ``"images"`` parameter, or no other
                parameter that can serve as the output buffer.
        """
        self.model = migraphx.load(str(model_path))
        self.classes = self.COCO_CLASSES
        self.conf_threshold = conf_threshold
        param_shapes = self.model.get_parameter_shapes()
        self.input_name = "images"
        # Fail with a readable message instead of a bare KeyError/StopIteration
        # when the .mxr was compiled from a model with different parameter names.
        if self.input_name not in param_shapes:
            raise ValueError(
                f"Model {model_path} has no '{self.input_name}' input; "
                f"parameters found: {sorted(param_shapes)}"
            )
        output_names = [name for name in param_shapes if name != self.input_name]
        if not output_names:
            raise ValueError(
                f"Model {model_path} exposes no output parameter besides '{self.input_name}'"
            )
        self.output_name = output_names[0]
        self.input_shape = param_shapes[self.input_name]
        self.output_shape = param_shapes[self.output_name]
        # Allocated once and reused for every frame; MIGraphX writes into it
        # through the raw pointer bound below.
        self.output_tensor = torch.empty_strided(
            self.output_shape.lens(), self.output_shape.strides(), dtype=torch.float32, device="cuda"
        )
        self.mgx_output_arg = migraphx.argument_from_pointer(
            self.output_shape, self.output_tensor.data_ptr()
        )

    # --- Step 2: Preprocessing ------------------------------------------------
    def preprocess_color_layout(self, rgb_tensor: torch.Tensor) -> torch.Tensor:
        """Convert HWC uint8 RGB to BCHW float32 in [0, 1]."""
        return rgb_tensor.permute(2, 0, 1).unsqueeze(0) / 255.0

    @staticmethod
    def letterbox_geometry(h: int, w: int, target: int = TARGET_INPUT_SIZE) -> tuple[float, int, int]:
        """Return uniform scale and symmetric letterbox padding for *target*x*target*.

        The returned ``(scale, pad_x, pad_y)`` is what ``transform_coordinates``
        needs to map model-space boxes back to the source frame.
        """
        scale = min(target / w, target / h)
        pad_x = (target - int(w * scale)) // 2
        pad_y = (target - int(h * scale)) // 2
        return scale, pad_x, pad_y

    def preprocess_spatial(self, tensor: torch.Tensor, target: int = TARGET_INPUT_SIZE) -> torch.Tensor:
        """Resize to scaled size and letterbox-pad to *target*x*target*."""
        h, w = tensor.shape[2], tensor.shape[3]
        scale, pad_x, pad_y = self.letterbox_geometry(h, w, target)
        new_h, new_w = int(h * scale), int(w * scale)
        tensor = F.interpolate(tensor, size=(new_h, new_w), mode="bilinear", align_corners=False)
        # F.pad order is (left, right, top, bottom); the right/bottom side takes
        # whatever remains so the result is exactly target x target.
        padding = (pad_x, target - new_w - pad_x, pad_y, target - new_h - pad_y)
        # run_inference hands MIGraphX a raw pointer, so the buffer must be contiguous.
        return F.pad(tensor, padding, value=LETTERBOX_FILL_NORM).contiguous()

    # --- Step 3: Inference ----------------------------------------------------
    def run_inference(self, input_tensor: torch.Tensor) -> torch.Tensor:
        """Enqueue MIGraphX inference on the active PyTorch stream.

        Returns the pre-allocated output tensor. The work is only enqueued: the
        caller must synchronize the stream before reading the result on the host.
        """
        curr_stream = torch.cuda.current_stream()
        mgx_buffers = {
            self.input_name: migraphx.argument_from_pointer(self.input_shape, input_tensor.data_ptr()),
            self.output_name: self.mgx_output_arg,
        }
        self.model.run_async(mgx_buffers, curr_stream.cuda_stream, "ihipStream_t")
        return self.output_tensor

    # --- Step 4: Postprocessing ----------------------------------------------
    @staticmethod
    def filter_predictions(raw: torch.Tensor, conf_thresh: float) -> torch.Tensor:
        """Keep predictions above *conf_thresh* (returns [N, 6]).

        The result is cloned so later in-place edits never touch the reused
        model output buffer.
        """
        preds = raw[0]
        mask = preds[:, 4] > conf_thresh
        return preds[mask].clone()

    @staticmethod
    def transform_coordinates(
        survivors: torch.Tensor, scale: float, pad_x: int, pad_y: int
    ) -> torch.Tensor:
        """Map boxes from letterboxed model space back to the source frame.

        Modifies *survivors* in place and returns it.
        """
        survivors[:, [0, 2]] = (survivors[:, [0, 2]] - pad_x) / scale
        survivors[:, [1, 3]] = (survivors[:, [1, 3]] - pad_y) / scale
        return survivors

    # --- Pipeline -------------------------------------------------------------
    def detect_on_gpu(self, rgb_tensor: torch.Tensor | np.ndarray) -> list[Detection]:
        """Run Step 2 to 4 on a GPU-resident RGB frame and return detections.

        *rgb_tensor* may be either a [H, W, 3] uint8 CUDA tensor (rocDecode path)
        or a host numpy array (OpenCV baseline); the latter is uploaded to GPU once.

        Raises:
            ValueError: if a ``torch.Tensor`` that does not live on the GPU is passed.
        """
        if not isinstance(rgb_tensor, torch.Tensor):
            rgb_tensor = torch.from_numpy(rgb_tensor).cuda()

        # Step 2: preprocess on GPU.
        # Explicit raise rather than assert: asserts vanish under `python -O`,
        # and a host pointer must never be handed to MIGraphX as device memory.
        if not rgb_tensor.is_cuda:
            raise ValueError("decoder returned a CPU tensor")
        chw = self.preprocess_color_layout(rgb_tensor)
        scale, pad_x, pad_y = self.letterbox_geometry(chw.shape[2], chw.shape[3])
        model_input = self.preprocess_spatial(chw)

        # Step 3: inference -> [1, 300, 6] = [x1, y1, x2, y2, conf, class_id].
        raw = self.run_inference(model_input)

        # Step 4: postprocess on GPU; survivors copied to host in one batched DtoH.
        survivors = self.filter_predictions(raw, conf_thresh=self.conf_threshold)
        survivors = self.transform_coordinates(survivors, scale, pad_x, pad_y)

        # Make sure everything enqueued on the stream (including run_async) has
        # finished before the host reads the results.
        torch.cuda.current_stream().synchronize()
        host = survivors.cpu().numpy()

        detections: list[Detection] = []
        for x1, y1, x2, y2, conf, cid in host:
            cid = int(cid)
            # Ids outside the COCO table (including negatives, which would
            # otherwise index from the end of the list) get a synthetic name.
            class_name = self.classes[cid] if 0 <= cid < len(self.classes) else f"class_{cid}"
            detections.append(Detection(
                class_id=cid,
                class_name=class_name,
                confidence=float(conf),
                box=(int(x1), int(y1), int(x2 - x1), int(y2 - y1)),
            ))
        return detections


# ============================================================================
# Drawing, DLPack shim, video writer, stats
# ============================================================================

def draw_detections(frame: np.ndarray, detections: list[Detection]) -> None:
    """Draw bounding boxes and confidence labels onto *frame* in-place (BGR)."""
    font = cv2.FONT_HERSHEY_SIMPLEX
    wb, wg, wr = LUMA_WEIGHTS_BGR
    for det in detections:
        x, y, w, h = det.box
        colour = PALETTE_BGR[det.class_id % len(PALETTE_BGR)]
        label = f"{det.class_name} {det.confidence:.2f}"
        cv2.rectangle(frame, (x, y), (x + w, y + h), colour, 2)
        (lw, lh), bl = cv2.getTextSize(label, font, 0.5, 1)
        b, g, r = colour
        # Dark text on bright label backgrounds, white text on dark ones.
        luma = wb * b + wg * g + wr * r
        text_colour = (20, 20, 20) if luma > LUMA_TEXT_DARK_THRESHOLD else (255, 255, 255)
        cv2.rectangle(frame, (x, y - lh - bl - 6), (x + lw + 4, y), colour, -1)
        cv2.putText(frame, label, (x + 2, y - bl - 2), font, 0.5, text_colour, 1, cv2.LINE_AA)


def decoded_rgb_view(packet) -> torch.Tensor:
    """Wrap rocPyDecode's RGB surface as a correctly-strided [H-1, W, 3] PyTorch view.

    Workaround for a rocPyDecode 0.8.0 bug: the DLPack capsule advertises strides
    ``(W*3, 1, 0)`` for an ``[H, W, 3]`` shape (collapsing R/G/B onto one byte).
    The buffer itself is packed HWC RGB but is one row short of ``H*W*3``; we
    rebuild the strides as ``(W*3, 3, 1)`` and clamp height to ``H-1``.
    """
    raw = torch.from_dlpack(packet.ext_buf[0])
    H, W = raw.shape[:2]
    return raw.as_strided((H - 1, W, 3), (W * 3, 3, 1))


def _open_video(path: str | Path) -> tuple[cv2.VideoCapture, VideoInfo]:
    """Open *path* with OpenCV and return the capture together with its metadata.

    Raises ``ValueError`` if the video cannot be opened. The caller owns the
    returned capture and is responsible for releasing it.
    """
    cap = cv2.VideoCapture(str(path))
    if not cap.isOpened():
        raise ValueError(f"Cannot open video: {path}")
    info = VideoInfo(
        # Round instead of truncating so 29.97 fps is written as 30, not 29.
        fps=round(cap.get(cv2.CAP_PROP_FPS)),
        width=int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        height=int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        total_frames=int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
    )
    return cap, info


def _make_writer(path: str | Path, fps: int, width: int, height: int) -> cv2.VideoWriter:
    """Create an MP4 writer at *path* for ``(width, height)`` frames at *fps*.

    Raises ``RuntimeError`` if OpenCV could not open the output; without this
    check every later ``write()`` would be silently dropped.
    """
    writer = cv2.VideoWriter(str(path), cv2.VideoWriter_fourcc(*MP4_FOURCC), fps, (width, height))
    if not writer.isOpened():
        writer.release()
        raise RuntimeError(f"Cannot open video writer: {path}")
    return writer


def _read_frames(cap: cv2.VideoCapture):
    """Yield ``(read_start, frame)`` pairs until OpenCV reports end-of-stream.

    ``read_start`` is the ``time.perf_counter()`` value taken just before the
    frame was read, so callers can include decode time in their measurements.
    """
    while True:
        read_start = time.perf_counter()
        ok, frame = cap.read()
        if not ok:
            return
        yield read_start, frame


# ============================================================================
# Run modes
# ============================================================================

def process_video_rocdecode(
    detector: Detector, input_path: str | Path, output_path: str | Path
) -> None:
    """Run the full GPU pipeline: VCN decode -> DLPack -> Detector -> draw -> encode.

    Raises:
        RuntimeError: if pyRocVideoDecode cannot be imported or the codec is
            not supported on the GPU.
        ValueError: if the input video cannot be opened for probing.
    """
    # Kept broad on purpose: any failure while loading the bindings is reported
    # with the same installation hint.
    try:
        import pyRocVideoDecode.decoder as rocdec
        import pyRocVideoDecode.demuxer as rocdmx
        import pyRocVideoDecode.types as rocdectypes
    except Exception as e:
        raise RuntimeError(
            "pyRocVideoDecode is not available. Ensure rocdecode/rocPyDecode are installed and set PYTHONPATH=/opt/rocm/lib"
        ) from e

    # OpenCV is used only to probe metadata (fps) for the output writer.
    cap, info = _open_video(input_path)
    cap.release()
    print(f"\nVideo info: {info}")

    writer: cv2.VideoWriter | None = None  # created lazily on first decoded frame

    demux = rocdmx.demuxer(str(input_path))
    codec_id = rocdec.GetRocDecCodecID(demux.GetCodecId())

    viddec = rocdec.decoder(
        codec_id,
        device_id=DEFAULT_DEVICE_ID,
        mem_type=rocdectypes.OUT_SURFACE_MEM_DEV_COPIED,
        b_force_zero_latency=False,
        crop_rect=None,
        max_width=0,
        max_height=0,
        clk_rate=1000,
    )

    if not viddec.IsCodecSupported(DEFAULT_DEVICE_ID, codec_id, demux.GetBitDepth()):
        cfg = viddec.GetGpuInfo()
        raise RuntimeError("Codec is not supported on this GPU " + cfg.device_name)

    print("Decoding started, please wait...")

    timings = Timings()

    # try/finally so the MP4 is finalised even if a frame fails mid-run.
    try:
        while True:
            # Step 1: Decoding: VCN demuxes and decodes the compressed bitstream on-chip
            demux_decode_start = time.perf_counter()
            packet = demux.DemuxFrame()
            n_frame_returned = viddec.DecodeFrame(packet)
            timings.pipeline_s += time.perf_counter() - demux_decode_start

            for _ in range(n_frame_returned):  # one packet may yield 0-N frames
                frame_process_start = time.perf_counter()
                # NV12->RGB on the GPU via HIP kernel
                pts = viddec.GetFrameRgb(packet, rgb_format=ROC_RGB_FORMAT_RGB)
                if pts == -1:
                    viddec.ReleaseFrame(packet)
                    continue

                rgb_tensor = decoded_rgb_view(packet)  # [H-1, W, 3] uint8, cuda

                # Steps 2-4: preprocess (resize+letterbox) -> MIGraphX inference -> filter+remap, all on GPU.
                predict_start = time.perf_counter()
                detections = detector.detect_on_gpu(rgb_tensor)
                timings.predict_s += time.perf_counter() - predict_start
                timings.pipeline_s += time.perf_counter() - frame_process_start
                timings.frames += 1

                # Demo-only host-side work (raw-frame DtoH, draw, encode); excluded from pipeline_s.
                frame = cv2.cvtColor(rgb_tensor.cpu().numpy(), cv2.COLOR_RGB2BGR)
                draw_detections(frame, detections)
                if writer is None:
                    # Size the writer from the decoded frame, not the probed
                    # metadata, because the view is one row shorter.
                    fh, fw = frame.shape[:2]
                    writer = _make_writer(output_path, info.fps, fw, fh)
                writer.write(frame)

                viddec.ReleaseFrame(packet)

            if packet.bitstream_size <= 0:  # end-of-stream
                break
    finally:
        if writer is not None:
            writer.release()

    timings.report("pipeline (demux+decode+predict)", output_path)


def process_video_opencv(
    detector: Detector, input_path: str | Path, output_path: str | Path
) -> None:
    """Run the OpenCV CPU-decoding baseline; Step 2 to 4 are identical to the GPU path.

    Frames are read until OpenCV reports end-of-stream; the probed frame count
    only sizes the progress bar, so inaccurate metadata cannot truncate the output.
    """
    cap, info = _open_video(input_path)
    writer: cv2.VideoWriter | None = None
    timings = Timings()

    # try/finally so the capture and the MP4 are closed even if a frame fails.
    try:
        print(f"\nVideo info: {info}")
        writer = _make_writer(output_path, info.fps, info.width, info.height)

        # A non-positive count means the container gave no usable estimate;
        # rich then shows an indeterminate bar.
        expected = info.total_frames if info.total_frames > 0 else None
        frames = track(_read_frames(cap), total=expected, description="Processing video...")

        # Step 1: Decoding on CPU (OpenCV baseline); the timer starts before the read.
        for frame_process_start, frame in frames:  # frame is BGR
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Steps 2-4: preprocess (resize+letterbox) -> MIGraphX inference -> filter+remap, all on GPU.
            predict_start = time.perf_counter()
            detections = detector.detect_on_gpu(rgb)
            timings.predict_s += time.perf_counter() - predict_start
            timings.pipeline_s += time.perf_counter() - frame_process_start
            timings.frames += 1

            draw_detections(frame, detections)
            writer.write(frame)
    finally:
        cap.release()
        if writer is not None:
            writer.release()

    timings.report("pipeline (read+predict)", output_path)


# ============================================================================
# CLI
# ============================================================================

def main() -> None:
    """Parse CLI arguments, build the detector, and run the selected decoder path."""
    p = argparse.ArgumentParser(
        description="GPU-resident YOLO video inference on AMD GPUs",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument("--decoder", default="rocdecode", choices=["opencv", "rocdecode"], help="Video decoder backend")
    p.add_argument("--model", default="model.mxr", type=Path, help="MIGraphX .mxr model path")
    p.add_argument("--input", default="data/peloton_sample_ai_gen.mp4", type=Path, help="Input video path (MP4/MKV; H.264/H.265 for rocdecode)")
    p.add_argument("--output", default="output.mp4", type=Path, help="Output video path")
    p.add_argument("--conf-threshold", default=0.25, type=float, help="Detection confidence threshold")
    args = p.parse_args()

    print(f"Loading model from: {args.model}")
    detector = Detector(model_path=args.model, conf_threshold=args.conf_threshold)
    if args.decoder == "rocdecode":
        process_video_rocdecode(detector, args.input, args.output)
    else:
        process_video_opencv(detector, args.input, args.output)


if __name__ == "__main__":
    main()
+# +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: MIT + +"""Export YOLO26s to ONNX and compile it for MIGraphX. + +The export step must run in a separate interpreter from the compile step: +Ultralytics' ONNX export sets ``CUDA_VISIBLE_DEVICES=''`` to force a CPU +trace, and HIP caches the device list at first use, so a subsequent +``migraphx.get_target("gpu")`` in the same process raises +``get_device_id: No device``. The subprocess fence below isolates that +side effect; the compile step then runs against a clean HIP environment. +""" + +import subprocess +import sys + +# --- Step 0a: Export YOLO26s to ONNX +subprocess.check_call([ + sys.executable, "-c", + "from ultralytics import YOLO; " + "YOLO('yolo26s.pt').export(format='onnx', dynamic=False, batch=1, imgsz=640)", +]) + +# --- Step 0b: Compile ONNX -> MIGraphX .mxr (article snippet, verbatim) +import migraphx # noqa: E402 (HIP must initialize after the export subprocess; see module docstring) + +model = migraphx.parse_onnx("yolo26s.onnx") + +# FP16 quantization speeds up model inference, usually with only a negligible accuracy loss +migraphx.quantize_fp16(model) + +# offload_copy=False exposes the output as a named parameter so we can bind +# a pre-allocated PyTorch tensor to it at inference time (see Step 3). +model.compile(migraphx.get_target("gpu"), offload_copy=False) + +migraphx.save(model, "model.mxr") diff --git a/AI/MIGraphX/gpu_resident_yolo26_pipeline/requirements.txt b/AI/MIGraphX/gpu_resident_yolo26_pipeline/requirements.txt new file mode 100644 index 000000000..5e5c17be5 --- /dev/null +++ b/AI/MIGraphX/gpu_resident_yolo26_pipeline/requirements.txt @@ -0,0 +1,11 @@ +# Pinned versions used by the blog sample. PyTorch, MIGraphX, and rocDecode +# come from the rocm/pytorch:rocm7.2.2_ubuntu22.04_py3.10_pytorch_release_2.10.0 +# container and are not installed via pip here.
+ +opencv-python-headless==4.10.0.84 +numpy==1.26.4 +rich==13.7.1 + +# Required only by prepare_model.py (ONNX export). +ultralytics==8.4.41 +onnx==1.16.2