From 7f646361d8e83a15c03f1923ae9a3ae0184fa0a9 Mon Sep 17 00:00:00 2001
From: SpyrosMouselinos <mouselinos.spur.kw@gmail.com>
Date: Mon, 20 Oct 2025 18:39:50 +0200
Subject: [PATCH 1/5] Disconnect Init Code

---
 Dockerfile                         |   9 +
 README.md                          |  78 +++
 core/gpu_disconnect.py             | 544 ++++++++++++++++
 core/handlers.py                   | 111 +++-
 core/hub_handlers.py               | 252 +++++++-
 docker-compose.yml                 |   5 +
 requirements.txt                   |   3 +-
 static/css/disconnect-controls.css | 613 ++++++++++++++++++
 static/js/gpu-cards.js             |  21 +-
 static/js/gpu-disconnect.js        | 963 +++++++++++++++++++++++++++++
 templates/index.html               |   4 +-
 11 files changed, 2596 insertions(+), 7 deletions(-)
 create mode 100644 core/gpu_disconnect.py
 create mode 100644 static/css/disconnect-controls.css
 create mode 100644 static/js/gpu-disconnect.js

diff --git a/Dockerfile b/Dockerfile
index ea08b2a..38cd5c3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,14 @@
 FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
 
+# GPU Hot - Real-time NVIDIA GPU Monitoring with Disconnect Testing
+# 
+# IMPORTANT: For GPU disconnect functionality, this container requires:
+# - privileged: true (to access PCI sysfs)
+# - volumes: /sys/bus/pci:/sys/bus/pci:rw (for PCI operations)
+# - volumes: /sys/devices:/sys/devices:ro (for device enumeration)
+# 
+# See docker-compose.yml for complete configuration example
+
 # Set environment variables
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
diff --git a/README.md b/README.md
index 41693c8..758c55e 100644
--- a/README.md
+++ b/README.md
@@ -58,11 +58,76 @@ docker-compose up --build
 - Historical charts (utilization, temperature, power, clocks)
 - System metrics (CPU, RAM)
 - Scale from 1 to 100+ GPUs
+- **GPU Disconnect Testing** - Simulate GPU failures for fault tolerance testing
 
 **Metrics:** Utilization, temperature, memory, power draw, fan speed, clock speeds, PCIe info, P-State, throttle status, encoder/decoder sessions
 
 ---
 
+## GPU Disconnect Testing
+
+GPU Hot includes advanced fault tolerance testing through simulated GPU disconnect/reconnect operations. This feature helps test how your applications handle GPU failures in production environments.
+
+### Features
+- **Multiple disconnect methods** - Auto-select the most realistic method available:
+  - **Slot Power Toggle** - Actually cut and restore slot power (closest to physical disconnect)
+  - **Hot Reset** - Reset PCIe link using upstream bridge controls  
+  - **Logical Remove** - Software remove and re-scan (no hardware reset)
+  - **NVIDIA Reset** - Use NVIDIA driver reset functionality
+- **Individual GPU control** - Disconnect specific GPUs from detailed view
+- **Multi-GPU operations** - Select and disconnect multiple GPUs simultaneously
+- **Hub coordination** - Hub can trigger disconnects on remote nodes
+- **Real-time feedback** - Live status updates during operations
+- **Safety features** - Process detection, confirmation dialogs, timeout protection
+
+### Requirements
+
+**For GPU disconnect functionality, the container requires elevated privileges:**
+```bash
+# Docker run with privileged mode
+docker run -d --gpus all --privileged \
+  -v /sys/bus/pci:/sys/bus/pci:rw \
+  -v /sys/devices:/sys/devices:ro \
+  -p 1312:1312 ghcr.io/psalias2006/gpu-hot:latest
+```
+
+**Or use docker-compose (recommended):**
+```bash
+# docker-compose.yml includes the required privileged configuration
+docker-compose up -d
+```
+
+### Usage
+
+1. **Individual GPU**: Click the "Disconnect" button in any GPU's detailed view
+2. **Multiple GPUs**: 
+   - Select GPUs using checkboxes in overview tab
+   - Click "Disconnect Selected" from the batch toolbar
+3. **Choose method** and duration in the modal dialog
+4. **Monitor progress** with real-time status updates
+
+### Security & Safety
+
+⚠️ **Important Considerations:**
+- Requires **root privileges** inside container (privileged mode)
+- Will **interrupt running processes** on affected GPUs
+- Includes **confirmation dialogs** and active process warnings
+- All operations are **logged** for audit trails
+- **Rate limiting** prevents abuse
+- Works on **dedicated GPU slots** (avoid shared PCIe buses)
+
+### Hub Mode
+The hub can coordinate disconnect operations across multiple nodes:
+```bash
+# Hub triggers disconnect on specific node
+POST /api/hub/gpu/{node_name}/{gpu_id}/disconnect
+
+# Multi-node batch operations supported
+POST /api/hub/gpu/disconnect-multiple
+```
+
+---
+
 ## Configuration
 
 **Environment variables:**
@@ -88,6 +153,19 @@ PORT = 1312            # Server port
 ```bash
 GET /              # Dashboard
 GET /api/gpu-data  # JSON metrics
+
+# GPU Disconnect API (Node Mode)
+GET  /api/gpu/{gpu_id}/disconnect/methods        # Get available disconnect methods
+POST /api/gpu/{gpu_id}/disconnect               # Disconnect specific GPU
+POST /api/gpu/disconnect-multiple               # Disconnect multiple GPUs
+GET  /api/gpu/disconnect/status                 # System disconnect capabilities
+
+# GPU Disconnect API (Hub Mode)
+GET  /api/hub/nodes                             # List connected nodes
+GET  /api/hub/gpu/{node}/{gpu_id}/disconnect/methods  # Get methods for node GPU
+POST /api/hub/gpu/{node}/{gpu_id}/disconnect   # Disconnect GPU on specific node
+POST /api/hub/gpu/disconnect-multiple          # Multi-node batch disconnect
+GET  /api/hub/gpu/disconnect/status             # Hub-wide disconnect status
 ```
 
 ### WebSocket
diff --git a/core/gpu_disconnect.py b/core/gpu_disconnect.py
new file mode 100644
index 0000000..29f3457
--- /dev/null
+++ b/core/gpu_disconnect.py
@@ -0,0 +1,544 @@
+#!/usr/bin/env python3
+"""
+GPU Disconnect/Reconnect Utility for GPU Hot
+Simulates GPU disconnect/reconnect on Linux for fault tolerance testing
+"""
+
+import asyncio
+import os
+import subprocess
+import logging
+import time
+from pathlib import Path
+from typing import Optional, Dict, List
+from enum import Enum
+
+logger = logging.getLogger(__name__)
+
+SYSFS_PCI_DEVICES = Path("/sys/bus/pci/devices")
+SYSFS_PCI_SLOTS = Path("/sys/bus/pci/slots")
+SYSFS_PCI_RESCAN = Path("/sys/bus/pci/rescan")
+
+
+class DisconnectMethod(Enum):
+    """Available GPU disconnect methods"""
+    AUTO = "auto"
+    SLOT_POWER = "slot"
+    HOT_RESET = "hot"
+    LOGICAL = "logical"
+    NVIDIA_RESET = "nvidia"
+
+
+class GPUDisconnectError(Exception):
+    """Custom exception for GPU disconnect operations"""
+    pass
+
+
+class GPUDisconnector:
+    """Manages GPU disconnect/reconnect operations"""
+
+    def __init__(self):
+        self._check_root_permissions()
+
+    def _check_root_permissions(self):
+        """Check if running with sufficient privileges"""
+        if os.geteuid() != 0:
+            logger.warning("GPU disconnect requires root privileges. Operations may fail.")
+
+    async def disconnect_gpu(
+        self, 
+        gpu_index: int, 
+        method: DisconnectMethod = DisconnectMethod.AUTO,
+        down_time: float = 5.0
+    ) -> Dict[str, any]:
+        """
+        Disconnect and reconnect a GPU
+        
+        Args:
+            gpu_index: NVIDIA GPU index (0-based)
+            method: Disconnect method to use
+            down_time: Seconds to keep device disconnected
+            
+        Returns:
+            Dict with operation results
+        """
+        try:
+            # Get GPU PCI bus ID
+            bdf = await self._get_gpu_bdf(gpu_index)
+            logger.info(f"Disconnecting GPU {gpu_index} (PCI: {bdf}) using method: {method.value}")
+            
+            # Check for active processes
+            processes = await self._check_gpu_processes(gpu_index)
+            if processes:
+                logger.warning(f"GPU {gpu_index} has {len(processes)} active processes")
+            
+            # Perform disconnect/reconnect
+            result = await self._execute_disconnect(bdf, method, down_time)
+            result.update({
+                'gpu_index': gpu_index,
+                'bdf': bdf,
+                'method_used': method.value,
+                'down_time': down_time,
+                'active_processes': len(processes)
+            })
+            
+            logger.info(f"GPU {gpu_index} disconnect/reconnect completed successfully")
+            return result
+            
+        except Exception as e:
+            error_msg = f"Failed to disconnect GPU {gpu_index}: {str(e)}"
+            logger.error(error_msg)
+            raise GPUDisconnectError(error_msg) from e
+
+    async def disconnect_multiple_gpus(
+        self,
+        gpu_indices: List[int],
+        method: DisconnectMethod = DisconnectMethod.AUTO,
+        down_time: float = 5.0
+    ) -> Dict[str, any]:
+        """
+        Disconnect multiple GPUs simultaneously
+        
+        Args:
+            gpu_indices: List of GPU indices to disconnect
+            method: Disconnect method to use
+            down_time: Seconds to keep devices disconnected
+            
+        Returns:
+            Dict with results for each GPU
+        """
+        logger.info(f"Disconnecting {len(gpu_indices)} GPUs: {gpu_indices}")
+        
+        # Create tasks for each GPU
+        tasks = []
+        for gpu_index in gpu_indices:
+            task = asyncio.create_task(
+                self.disconnect_gpu(gpu_index, method, down_time),
+                name=f"disconnect_gpu_{gpu_index}"
+            )
+            tasks.append((gpu_index, task))
+        
+        # Wait for all operations to complete
+        results = {}
+        errors = {}
+        
+        for gpu_index, task in tasks:
+            try:
+                results[gpu_index] = await task
+            except Exception as e:
+                errors[gpu_index] = str(e)
+                logger.error(f"GPU {gpu_index} disconnect failed: {e}")
+        
+        return {
+            'total_gpus': len(gpu_indices),
+            'successful': len(results),
+            'failed': len(errors),
+            'results': results,
+            'errors': errors
+        }
+
+    async def get_available_methods(self, gpu_index: int) -> List[str]:
+        """Get available disconnect methods for a GPU"""
+        methods = []
+        
+        try:
+            bdf = await self._get_gpu_bdf(gpu_index)
+            
+            # Check slot power
+            if self._has_slot_power(bdf):
+                methods.append(DisconnectMethod.SLOT_POWER.value)
+            
+            # Check hot reset capability
+            if self._has_hot_reset_capability(bdf):
+                methods.append(DisconnectMethod.HOT_RESET.value)
+            
+            # Logical remove always available
+            methods.append(DisconnectMethod.LOGICAL.value)
+            
+            # NVIDIA reset (if nvidia-smi available)
+            if await self._has_nvidia_smi():
+                methods.append(DisconnectMethod.NVIDIA_RESET.value)
+                
+        except Exception as e:
+            logger.error(f"Error checking methods for GPU {gpu_index}: {e}")
+        
+        return methods
+
+    async def _get_gpu_bdf(self, gpu_index: int) -> str:
+        """Get PCI bus ID for GPU index using nvidia-smi"""
+        try:
+            result = await asyncio.create_subprocess_exec(
+                'nvidia-smi', '--query-gpu=pci.bus_id', '--format=csv,noheader', '-i', str(gpu_index),
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE
+            )
+            stdout, stderr = await result.communicate()
+            
+            if result.returncode != 0:
+                raise GPUDisconnectError(f"nvidia-smi failed: {stderr.decode()}")
+            
+            bdf = stdout.decode().strip()
+            if bdf.startswith("00000000:"):
+                bdf = "0000:" + bdf.split(":", 1)[1]
+            
+            return bdf
+            
+        except Exception as e:
+            raise GPUDisconnectError(f"Failed to get PCI bus ID for GPU {gpu_index}: {e}")
+
+    async def _check_gpu_processes(self, gpu_index: int) -> List[Dict]:
+        """Check for active processes on GPU"""
+        try:
+            result = await asyncio.create_subprocess_exec(
+                'nvidia-smi', '--query-compute-apps=pid,process_name', '--format=csv,noheader', '-i', str(gpu_index),
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE
+            )
+            stdout, stderr = await result.communicate()
+            
+            if result.returncode != 0:
+                return []
+            
+            processes = []
+            for line in stdout.decode().strip().splitlines():
+                if line.strip() and "No running processes found" not in line:
+                    parts = line.split(',', 1)
+                    if len(parts) == 2:
+                        processes.append({
+                            'pid': parts[0].strip(),
+                            'name': parts[1].strip()
+                        })
+            
+            return processes
+            
+        except Exception:
+            return []
+
+    async def _execute_disconnect(self, bdf: str, method: DisconnectMethod, down_time: float) -> Dict:
+        """Execute the actual disconnect/reconnect operation"""
+        if method == DisconnectMethod.AUTO:
+            method = await self._select_best_method(bdf)
+        
+        start_time = time.time()
+        
+        try:
+            if method == DisconnectMethod.SLOT_POWER:
+                await self._slot_power_disconnect(bdf, down_time)
+            elif method == DisconnectMethod.HOT_RESET:
+                await self._hot_reset_disconnect(bdf, down_time)
+            elif method == DisconnectMethod.LOGICAL:
+                await self._logical_disconnect(bdf, down_time)
+            elif method == DisconnectMethod.NVIDIA_RESET:
+                await self._nvidia_reset_disconnect(bdf, down_time)
+            else:
+                raise GPUDisconnectError(f"Unsupported method: {method}")
+            
+            duration = time.time() - start_time
+            return {
+                'success': True,
+                'method_executed': method.value,
+                'duration_seconds': duration,
+                'message': f"Successfully completed {method.value} disconnect/reconnect"
+            }
+            
+        except Exception as e:
+            duration = time.time() - start_time
+            return {
+                'success': False,
+                'method_executed': method.value,
+                'duration_seconds': duration,
+                'error': str(e)
+            }
+
+    async def _select_best_method(self, bdf: str) -> DisconnectMethod:
+        """Select the best available method for maximum realism"""
+        if self._has_slot_power(bdf):
+            return DisconnectMethod.SLOT_POWER
+        elif self._has_hot_reset_capability(bdf):
+            return DisconnectMethod.HOT_RESET
+        else:
+            return DisconnectMethod.LOGICAL
+
+    def _has_slot_power(self, bdf: str) -> bool:
+        """Check if slot power control is available"""
+        try:
+            dev = SYSFS_PCI_DEVICES / bdf
+            if not dev.exists():
+                return False
+            
+            # Check for slot symlink
+            slot_link = dev / "slot"
+            if slot_link.exists():
+                power_file = slot_link / "power"
+                return power_file.exists()
+            
+            # Check slots directory
+            if SYSFS_PCI_SLOTS.exists():
+                target = bdf.split(".")[0]  # Remove function
+                for slot in SYSFS_PCI_SLOTS.iterdir():
+                    addr_file = slot / "address"
+                    power_file = slot / "power"
+                    if addr_file.exists() and power_file.exists():
+                        try:
+                            addr = addr_file.read_text().strip()
+                            if addr == target:
+                                return True
+                        except Exception:
+                            continue
+            
+            return False
+            
+        except Exception:
+            return False
+
+    def _has_hot_reset_capability(self, bdf: str) -> bool:
+        """Check if hot reset is available"""
+        try:
+            # Check for upstream bridge reset capability
+            upstream_bdf = self._get_upstream_bdf(bdf)
+            if upstream_bdf:
+                upstream_dev = SYSFS_PCI_DEVICES / upstream_bdf
+                reset_sub = upstream_dev / "reset_subordinate"
+                reset_file = upstream_dev / "reset"
+                return reset_sub.exists() or reset_file.exists()
+            return False
+        except Exception:
+            return False
+
+    def _get_upstream_bdf(self, bdf: str) -> Optional[str]:
+        """Get upstream bridge BDF"""
+        try:
+            dev_path = SYSFS_PCI_DEVICES / bdf
+            parent = dev_path.resolve().parent.name
+            if ":" in parent:
+                return parent
+            return None
+        except Exception:
+            return None
+
+    async def _has_nvidia_smi(self) -> bool:
+        """Check if nvidia-smi is available"""
+        try:
+            result = await asyncio.create_subprocess_exec(
+                'nvidia-smi', '--version',
+                stdout=asyncio.subprocess.DEVNULL,
+                stderr=asyncio.subprocess.DEVNULL
+            )
+            await result.communicate()
+            return result.returncode == 0
+        except Exception:
+            return False
+
+    async def _slot_power_disconnect(self, bdf: str, down_time: float):
+        """Execute slot power disconnect"""
+        logger.info(f"Executing slot power disconnect for {bdf}")
+        
+        power_file = self._find_slot_power_file(bdf)
+        if not power_file:
+            raise GPUDisconnectError(f"Slot power file not found for {bdf}")
+        
+        # Unbind driver first
+        await self._unbind_driver(bdf)
+        
+        # Power off
+        await self._write_sysfs(power_file, "0")
+        logger.info(f"Slot powered OFF for {down_time}s")
+        
+        # Wait for device to disappear
+        await self._wait_for_condition(
+            lambda: not (SYSFS_PCI_DEVICES / bdf).exists(),
+            timeout=10,
+            description=f"{bdf} to disappear"
+        )
+        
+        await asyncio.sleep(down_time)
+        
+        # Power on
+        await self._write_sysfs(power_file, "1")
+        logger.info("Slot powered ON")
+        
+        # Rescan and rebind
+        await self._write_sysfs(SYSFS_PCI_RESCAN, "1")
+        await self._wait_for_condition(
+            lambda: (SYSFS_PCI_DEVICES / bdf).exists(),
+            timeout=30,
+            description=f"{bdf} to reappear"
+        )
+
+    async def _hot_reset_disconnect(self, bdf: str, down_time: float):
+        """Execute hot reset disconnect"""
+        logger.info(f"Executing hot reset for {bdf}")
+        
+        upstream_bdf = self._get_upstream_bdf(bdf)
+        if not upstream_bdf:
+            raise GPUDisconnectError(f"Cannot find upstream bridge for {bdf}")
+        
+        # Unbind and remove
+        await self._unbind_driver(bdf)
+        await self._write_sysfs(SYSFS_PCI_DEVICES / bdf / "remove", "1")
+        
+        await asyncio.sleep(0.25)
+        
+        # Try hot reset
+        upstream_dev = SYSFS_PCI_DEVICES / upstream_bdf
+        reset_sub = upstream_dev / "reset_subordinate"
+        reset_file = upstream_dev / "reset"
+        
+        if reset_sub.exists():
+            await self._write_sysfs(reset_sub, "1")
+        elif reset_file.exists():
+            await self._write_sysfs(reset_file, "1")
+        else:
+            raise GPUDisconnectError(f"No reset capability found for upstream {upstream_bdf}")
+        
+        await asyncio.sleep(down_time)
+        
+        # Rescan
+        await self._write_sysfs(SYSFS_PCI_RESCAN, "1")
+        await self._wait_for_condition(
+            lambda: (SYSFS_PCI_DEVICES / bdf).exists(),
+            timeout=30,
+            description=f"{bdf} to reappear"
+        )
+
+    async def _logical_disconnect(self, bdf: str, down_time: float):
+        """Execute logical disconnect (remove/rescan)"""
+        logger.info(f"Executing logical disconnect for {bdf}")
+        
+        # Unbind and remove
+        await self._unbind_driver(bdf)
+        await self._write_sysfs(SYSFS_PCI_DEVICES / bdf / "remove", "1")
+        
+        await asyncio.sleep(down_time)
+        
+        # Rescan
+        await self._write_sysfs(SYSFS_PCI_RESCAN, "1")
+        await self._wait_for_condition(
+            lambda: (SYSFS_PCI_DEVICES / bdf).exists(),
+            timeout=30,
+            description=f"{bdf} to reappear"
+        )
+
+    async def _nvidia_reset_disconnect(self, bdf: str, down_time: float):
+        """Execute NVIDIA GPU reset"""
+        logger.info(f"Executing NVIDIA reset for {bdf}")
+        
+        # Find GPU index from BDF
+        gpu_index = await self._get_gpu_index_from_bdf(bdf)
+        
+        result = await asyncio.create_subprocess_exec(
+            'nvidia-smi', '--gpu-reset', '-i', str(gpu_index),
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE
+        )
+        stdout, stderr = await result.communicate()
+        
+        if result.returncode != 0:
+            raise GPUDisconnectError(f"nvidia-smi --gpu-reset failed: {stderr.decode()}")
+        
+        await asyncio.sleep(down_time)
+
+    async def _get_gpu_index_from_bdf(self, target_bdf: str) -> int:
+        """Get GPU index from PCI bus ID"""
+        result = await asyncio.create_subprocess_exec(
+            'nvidia-smi', '--query-gpu=index,pci.bus_id', '--format=csv,noheader',
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE
+        )
+        stdout, stderr = await result.communicate()
+        
+        if result.returncode != 0:
+            raise GPUDisconnectError(f"Failed to query GPU indices: {stderr.decode()}")
+        
+        for line in stdout.decode().strip().splitlines():
+            parts = line.split(',')
+            if len(parts) >= 2:
+                index = int(parts[0].strip())
+                bdf = parts[1].strip()
+                if bdf.startswith("00000000:"):
+                    bdf = "0000:" + bdf.split(":", 1)[1]
+                if bdf == target_bdf:
+                    return index
+        
+        raise GPUDisconnectError(f"GPU index not found for BDF {target_bdf}")
+
+    def _find_slot_power_file(self, bdf: str) -> Optional[Path]:
+        """Find slot power control file"""
+        dev = SYSFS_PCI_DEVICES / bdf
+        slot_link = dev / "slot"
+        if slot_link.exists():
+            power_file = slot_link / "power"
+            if power_file.exists():
+                return power_file
+        
+        # Check slots directory
+        if SYSFS_PCI_SLOTS.exists():
+            target = bdf.split(".")[0]
+            for slot in SYSFS_PCI_SLOTS.iterdir():
+                addr_file = slot / "address"
+                power_file = slot / "power"
+                if addr_file.exists() and power_file.exists():
+                    try:
+                        addr = addr_file.read_text().strip()
+                        if addr == target:
+                            return power_file
+                    except Exception:
+                        continue
+        
+        return None
+
+    async def _unbind_driver(self, bdf: str):
+        """Unbind driver from device"""
+        try:
+            driver_link = SYSFS_PCI_DEVICES / bdf / "driver"
+            if driver_link.is_symlink():
+                driver_name = driver_link.resolve().name
+                unbind_file = Path(f"/sys/bus/pci/drivers/{driver_name}/unbind")
+                if unbind_file.exists():
+                    await self._write_sysfs(unbind_file, bdf)
+                    logger.debug(f"Unbound driver {driver_name} from {bdf}")
+        except Exception as e:
+            logger.warning(f"Failed to unbind driver for {bdf}: {e}")
+
+    async def _write_sysfs(self, path: Path, value: str):
+        """Write to sysfs file with proper error handling"""
+        try:
+            def write_sync():
+                path.write_text(value)
+            
+            await asyncio.get_event_loop().run_in_executor(None, write_sync)
+            logger.debug(f"Wrote '{value}' to {path}")
+            
+        except Exception as e:
+            raise GPUDisconnectError(f"Failed to write to {path}: {e}")
+
+    async def _wait_for_condition(self, condition, timeout: int, description: str):
+        """Wait for a condition to be true with timeout"""
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            if condition():
+                return
+            await asyncio.sleep(0.25)
+        
+        raise GPUDisconnectError(f"Timeout waiting for {description}")
+
+
+# Global instance
+gpu_disconnector = GPUDisconnector()
+
+
+async def disconnect_gpu(gpu_index: int, method: str = "auto", down_time: float = 5.0) -> Dict:
+    """Async wrapper for GPU disconnect operation"""
+    method_enum = DisconnectMethod(method)
+    return await gpu_disconnector.disconnect_gpu(gpu_index, method_enum, down_time)
+
+
+async def disconnect_multiple_gpus(gpu_indices: List[int], method: str = "auto", down_time: float = 5.0) -> Dict:
+    """Async wrapper for multiple GPU disconnect operation"""
+    method_enum = DisconnectMethod(method)
+    return await gpu_disconnector.disconnect_multiple_gpus(gpu_indices, method_enum, down_time)
+
+
+async def get_available_methods(gpu_index: int) -> List[str]:
+    """Get available disconnect methods for a GPU"""
+    return await gpu_disconnector.get_available_methods(gpu_index)
diff --git a/core/handlers.py b/core/handlers.py
index 070ff30..18f4d50 100644
--- a/core/handlers.py
+++ b/core/handlers.py
@@ -1,18 +1,34 @@
-"""Async WebSocket handlers for real-time monitoring"""
+"""Async WebSocket handlers for real-time monitoring and GPU disconnect API endpoints"""
 
 import asyncio
 import psutil
 import logging
 import json
 from datetime import datetime
-from fastapi import WebSocket
+from fastapi import WebSocket, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
 from . import config
+from .gpu_disconnect import disconnect_gpu, disconnect_multiple_gpus, get_available_methods, GPUDisconnectError
 
 logger = logging.getLogger(__name__)
 
 # Global WebSocket connections
 websocket_connections = set()
 
+
+# Pydantic models for API requests
+class DisconnectRequest(BaseModel):
+    method: str = "auto"
+    down_time: float = 5.0
+
+
+class MultiDisconnectRequest(BaseModel):
+    gpu_indices: list[int]
+    method: str = "auto"
+    down_time: float = 5.0
+
+
 def register_handlers(app, monitor):
     """Register FastAPI WebSocket handlers"""
     
@@ -34,6 +50,97 @@ async def websocket_endpoint(websocket: WebSocket):
             logger.debug(f'Dashboard client disconnected: {e}')
         finally:
             websocket_connections.discard(websocket)
+    
+    # GPU Disconnect API Endpoints
+    @app.get("/api/gpu/{gpu_id}/disconnect/methods")
+    async def get_disconnect_methods(gpu_id: int):
+        """Get available disconnect methods for a GPU"""
+        try:
+            methods = await get_available_methods(gpu_id)
+            return {
+                "gpu_id": gpu_id,
+                "available_methods": methods,
+                "default_method": "auto"
+            }
+        except Exception as e:
+            logger.error(f"Error getting disconnect methods for GPU {gpu_id}: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.post("/api/gpu/{gpu_id}/disconnect")
+    async def disconnect_single_gpu(gpu_id: int, request: DisconnectRequest):
+        """Disconnect and reconnect a specific GPU"""
+        try:
+            logger.info(f"Received disconnect request for GPU {gpu_id}, method: {request.method}, down_time: {request.down_time}s")
+            
+            result = await disconnect_gpu(
+                gpu_index=gpu_id,
+                method=request.method,
+                down_time=request.down_time
+            )
+            
+            return JSONResponse(content=result)
+            
+        except GPUDisconnectError as e:
+            logger.error(f"GPU disconnect error: {e}")
+            raise HTTPException(status_code=400, detail=str(e))
+        except Exception as e:
+            logger.error(f"Unexpected error during GPU {gpu_id} disconnect: {e}")
+            raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
+    
+    @app.post("/api/gpu/disconnect-multiple")
+    async def disconnect_multiple(request: MultiDisconnectRequest):
+        """Disconnect and reconnect multiple GPUs simultaneously"""
+        try:
+            logger.info(f"Received multi-disconnect request for GPUs {request.gpu_indices}, method: {request.method}, down_time: {request.down_time}s")
+            
+            result = await disconnect_multiple_gpus(
+                gpu_indices=request.gpu_indices,
+                method=request.method,
+                down_time=request.down_time
+            )
+            
+            return JSONResponse(content=result)
+            
+        except GPUDisconnectError as e:
+            logger.error(f"Multi-GPU disconnect error: {e}")
+            raise HTTPException(status_code=400, detail=str(e))
+        except Exception as e:
+            logger.error(f"Unexpected error during multi-GPU disconnect: {e}")
+            raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
+    
+    @app.get("/api/gpu/disconnect/status")
+    async def get_disconnect_status():
+        """Get current disconnect operation status and system capabilities"""
+        try:
+            # Check root permissions
+            import os
+            has_root = os.geteuid() == 0
+            
+            # Check nvidia-smi availability
+            import shutil
+            has_nvidia_smi = shutil.which("nvidia-smi") is not None
+            
+            # Check sysfs access
+            from pathlib import Path
+            sysfs_accessible = Path("/sys/bus/pci/devices").exists()
+            
+            return {
+                "ready": has_root and has_nvidia_smi and sysfs_accessible,
+                "permissions": {
+                    "root_access": has_root,
+                    "nvidia_smi_available": has_nvidia_smi,
+                    "sysfs_accessible": sysfs_accessible
+                },
+                "warnings": [
+                    "Root privileges required for PCI operations" if not has_root else None,
+                    "nvidia-smi not found in PATH" if not has_nvidia_smi else None,
+                    "PCI sysfs interface not accessible" if not sysfs_accessible else None
+                ]
+            }
+            
+        except Exception as e:
+            logger.error(f"Error checking disconnect status: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
 
 
 async def monitor_loop(monitor, connections):
diff --git a/core/hub_handlers.py b/core/hub_handlers.py
index 0f02826..5a26dab 100644
--- a/core/hub_handlers.py
+++ b/core/hub_handlers.py
@@ -1,15 +1,58 @@
-"""Async WebSocket handlers for hub mode"""
+"""Async WebSocket handlers for hub mode and GPU disconnect relay endpoints"""
 
 import asyncio
 import logging
 import json
-from fastapi import WebSocket
+import aiohttp
+from fastapi import WebSocket, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from typing import Dict, Any
 
 logger = logging.getLogger(__name__)
 
 # Global WebSocket connections
 websocket_connections = set()
 
+
+# Pydantic models for hub disconnect requests
+class HubDisconnectRequest(BaseModel):
+    method: str = "auto"
+    down_time: float = 5.0
+
+
+class HubMultiDisconnectRequest(BaseModel):
+    targets: list[dict]  # [{"node_name": "node1", "gpu_id": 0}, ...]
+    method: str = "auto"
+    down_time: float = 5.0
+
+
+async def forward_to_node(node_url: str, endpoint: str, method: str = "GET", data: Dict[str, Any] = None) -> Dict[str, Any]:
+    """Forward API request to a specific node"""
+    url = f"{node_url.rstrip('/')}/{endpoint.lstrip('/')}"
+    
+    try:
+        async with aiohttp.ClientSession() as session:
+            if method.upper() == "GET":
+                async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                    return await response.json()
+            elif method.upper() == "POST":
+                async with session.post(url, json=data, timeout=aiohttp.ClientTimeout(total=60)) as response:
+                    if response.status >= 400:
+                        error_text = await response.text()
+                        raise Exception(f"Node returned error {response.status}: {error_text}")
+                    return await response.json()
+            else:
+                raise ValueError(f"Unsupported HTTP method: {method}")
+                
+    except asyncio.TimeoutError:
+        raise Exception(f"Timeout connecting to node at {node_url}")
+    except aiohttp.ClientError as e:
+        raise Exception(f"Network error connecting to node at {node_url}: {str(e)}")
+    except Exception as e:
+        raise Exception(f"Error communicating with node at {node_url}: {str(e)}")
+
+
 def register_hub_handlers(app, hub):
     """Register FastAPI WebSocket handlers for hub mode"""
     
@@ -36,6 +79,211 @@ async def websocket_endpoint(websocket: WebSocket):
             logger.debug(f'Dashboard client disconnected: {e}')
         finally:
             websocket_connections.discard(websocket)
+    
+    # Hub GPU Disconnect API Endpoints
+    @app.get("/api/hub/nodes")
+    async def get_hub_nodes():
+        """Get list of connected nodes and their status"""
+        try:
+            nodes_info = {}
+            for node_name, node_data in hub.nodes.items():
+                nodes_info[node_name] = {
+                    'url': node_data['url'],
+                    'status': node_data['status'],
+                    'last_update': node_data['last_update']
+                }
+            
+            return {
+                'total_nodes': len(hub.nodes),
+                'online_nodes': sum(1 for n in hub.nodes.values() if n['status'] == 'online'),
+                'nodes': nodes_info
+            }
+            
+        except Exception as e:
+            logger.error(f"Error getting hub nodes: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.get("/api/hub/gpu/{node_name}/{gpu_id}/disconnect/methods")
+    async def get_node_disconnect_methods(node_name: str, gpu_id: int):
+        """Get available disconnect methods for a GPU on a specific node"""
+        try:
+            if node_name not in hub.nodes:
+                raise HTTPException(status_code=404, detail=f"Node '{node_name}' not found")
+            
+            node_data = hub.nodes[node_name]
+            if node_data['status'] != 'online':
+                raise HTTPException(status_code=503, detail=f"Node '{node_name}' is offline")
+            
+            node_url = node_data['url']
+            endpoint = f"api/gpu/{gpu_id}/disconnect/methods"
+            
+            result = await forward_to_node(node_url, endpoint, "GET")
+            result['node_name'] = node_name
+            
+            return result
+            
+        except HTTPException:
+            raise
+        except Exception as e:
+            logger.error(f"Error getting disconnect methods for {node_name}/GPU {gpu_id}: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.post("/api/hub/gpu/{node_name}/{gpu_id}/disconnect")
+    async def disconnect_node_gpu(node_name: str, gpu_id: int, request: HubDisconnectRequest):
+        """Disconnect a GPU on a specific node"""
+        try:
+            logger.info(f"Hub received disconnect request for {node_name}/GPU {gpu_id}")
+            
+            if node_name not in hub.nodes:
+                raise HTTPException(status_code=404, detail=f"Node '{node_name}' not found")
+            
+            node_data = hub.nodes[node_name]
+            if node_data['status'] != 'online':
+                raise HTTPException(status_code=503, detail=f"Node '{node_name}' is offline")
+            
+            node_url = node_data['url']
+            endpoint = f"api/gpu/{gpu_id}/disconnect"
+            request_data = {
+                'method': request.method,
+                'down_time': request.down_time
+            }
+            
+            result = await forward_to_node(node_url, endpoint, "POST", request_data)
+            result['node_name'] = node_name
+            result['hub_timestamp'] = datetime.now().isoformat()
+            
+            logger.info(f"Successfully relayed disconnect request to {node_name}/GPU {gpu_id}")
+            return JSONResponse(content=result)
+            
+        except HTTPException:
+            raise
+        except Exception as e:
+            logger.error(f"Error disconnecting {node_name}/GPU {gpu_id}: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.post("/api/hub/gpu/disconnect-multiple")
+    async def disconnect_multiple_node_gpus(request: HubMultiDisconnectRequest):
+        """Disconnect multiple GPUs across multiple nodes"""
+        try:
+            logger.info(f"Hub received multi-disconnect request for {len(request.targets)} targets")
+            
+            # Group targets by node
+            node_targets = {}
+            for target in request.targets:
+                node_name = target.get('node_name')
+                gpu_id = target.get('gpu_id')
+                
+                if not node_name or gpu_id is None:
+                    raise HTTPException(status_code=400, detail="Each target must have 'node_name' and 'gpu_id'")
+                
+                if node_name not in hub.nodes:
+                    raise HTTPException(status_code=404, detail=f"Node '{node_name}' not found")
+                
+                if node_name not in node_targets:
+                    node_targets[node_name] = []
+                node_targets[node_name].append(gpu_id)
+            
+            # Check all nodes are online
+            for node_name in node_targets:
+                if hub.nodes[node_name]['status'] != 'online':
+                    raise HTTPException(status_code=503, detail=f"Node '{node_name}' is offline")
+            
+            # Create tasks for each node
+            tasks = []
+            for node_name, gpu_ids in node_targets.items():
+                node_url = hub.nodes[node_name]['url']
+                
+                if len(gpu_ids) == 1:
+                    # Single GPU disconnect
+                    endpoint = f"api/gpu/{gpu_ids[0]}/disconnect"
+                    request_data = {
+                        'method': request.method,
+                        'down_time': request.down_time
+                    }
+                else:
+                    # Multi-GPU disconnect on same node
+                    endpoint = "api/gpu/disconnect-multiple"
+                    request_data = {
+                        'gpu_indices': gpu_ids,
+                        'method': request.method,
+                        'down_time': request.down_time
+                    }
+                
+                task = asyncio.create_task(
+                    forward_to_node(node_url, endpoint, "POST", request_data),
+                    name=f"disconnect_{node_name}"
+                )
+                tasks.append((node_name, task))
+            
+            # Wait for all tasks to complete
+            results = {}
+            errors = {}
+            
+            for node_name, task in tasks:
+                try:
+                    result = await task
+                    result['node_name'] = node_name
+                    results[node_name] = result
+                except Exception as e:
+                    errors[node_name] = str(e)
+                    logger.error(f"Error disconnecting GPUs on {node_name}: {e}")
+            
+            response = {
+                'total_nodes': len(node_targets),
+                'successful_nodes': len(results),
+                'failed_nodes': len(errors),
+                'results': results,
+                'errors': errors,
+                'hub_timestamp': datetime.now().isoformat()
+            }
+            
+            logger.info(f"Multi-disconnect completed: {len(results)} successful, {len(errors)} failed")
+            return JSONResponse(content=response)
+            
+        except HTTPException:
+            raise
+        except Exception as e:
+            logger.error(f"Error in hub multi-disconnect: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.get("/api/hub/gpu/disconnect/status")
+    async def get_hub_disconnect_status():
+        """Get disconnect capability status for all nodes"""
+        try:
+            node_status = {}
+            
+            for node_name, node_data in hub.nodes.items():
+                if node_data['status'] == 'online':
+                    try:
+                        node_url = node_data['url']
+                        result = await forward_to_node(node_url, "api/gpu/disconnect/status", "GET")
+                        node_status[node_name] = {
+                            'status': 'online',
+                            'capabilities': result
+                        }
+                    except Exception as e:
+                        node_status[node_name] = {
+                            'status': 'error',
+                            'error': str(e)
+                        }
+                else:
+                    node_status[node_name] = {
+                        'status': 'offline'
+                    }
+            
+            total_ready = sum(1 for status in node_status.values() 
+                            if status.get('capabilities', {}).get('ready', False))
+            
+            return {
+                'hub_ready': total_ready > 0,
+                'total_nodes': len(hub.nodes),
+                'ready_nodes': total_ready,
+                'node_status': node_status
+            }
+            
+        except Exception as e:
+            logger.error(f"Error getting hub disconnect status: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
 
 
 async def hub_loop(hub, connections):
diff --git a/docker-compose.yml b/docker-compose.yml
index 313a3af..1e20a0b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -15,6 +15,11 @@ services:
             - driver: nvidia
               count: all
               capabilities: [gpu]
+    # Required for GPU disconnect functionality
+    privileged: true
+    volumes:
+      - /sys/bus/pci:/sys/bus/pci:rw
+      - /sys/devices:/sys/devices:ro
     init: true
     pid: "host"
     restart: unless-stopped
diff --git a/requirements.txt b/requirements.txt
index a770860..a7b7cc4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ websockets==12.0
 psutil==5.9.6
 nvidia-ml-py==13.580.82
 requests==2.31.0
-websocket-client==1.6.3
\ No newline at end of file
+websocket-client==1.6.3
+aiohttp==3.9.1
\ No newline at end of file
diff --git a/static/css/disconnect-controls.css b/static/css/disconnect-controls.css
new file mode 100644
index 0000000..ab2b5ca
--- /dev/null
+++ b/static/css/disconnect-controls.css
@@ -0,0 +1,613 @@
+/* GPU Disconnect Controls Styles */
+
+/* Disconnect Button Styling */
+.disconnect-button {
+    background: linear-gradient(135deg, #ff6b6b, #ee5a52);
+    color: white;
+    border: none;
+    border-radius: 8px;
+    padding: 8px 16px;
+    font-size: 0.9rem;
+    font-weight: 500;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    display: inline-flex;
+    align-items: center;
+    gap: 6px;
+    margin-top: 8px;
+}
+
+.disconnect-button:hover:not(:disabled) {
+    background: linear-gradient(135deg, #ff5252, #e53935);
+    transform: translateY(-1px);
+    box-shadow: 0 4px 12px rgba(255, 107, 107, 0.3);
+}
+
+.disconnect-button:active:not(:disabled) {
+    transform: translateY(0);
+}
+
+.disconnect-button:disabled {
+    background: #ccc;
+    cursor: not-allowed;
+    transform: none;
+    box-shadow: none;
+}
+
+.disconnect-icon {
+    font-size: 1rem;
+    display: inline-block;
+}
+
+/* GPU Actions Container */
+.gpu-actions {
+    display: flex;
+    gap: 8px;
+    flex-wrap: wrap;
+    margin-top: 12px;
+    padding-top: 12px;
+    border-top: 1px solid rgba(255, 255, 255, 0.1);
+}
+
+/* GPU Selection Checkbox */
+.gpu-select-container {
+    z-index: 10;
+}
+
+.gpu-select-container label {
+    display: flex;
+    align-items: center;
+    gap: 4px;
+    font-size: 0.85rem;
+    color: rgba(255, 255, 255, 0.8);
+    cursor: pointer;
+    padding: 4px 8px;
+    border-radius: 4px;
+    background: rgba(0, 0, 0, 0.3);
+    transition: background 0.2s ease;
+}
+
+.gpu-select-container label:hover {
+    background: rgba(0, 0, 0, 0.5);
+}
+
+.gpu-select-checkbox {
+    margin: 0;
+    transform: scale(1.1);
+}
+
+/* Multi-Select Toolbar */
+.multi-select-toolbar {
+    position: fixed;
+    bottom: 20px;
+    left: 50%;
+    transform: translateX(-50%);
+    background: rgba(45, 45, 45, 0.95);
+    backdrop-filter: blur(10px);
+    border: 1px solid rgba(255, 255, 255, 0.1);
+    border-radius: 12px;
+    padding: 16px 24px;
+    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
+    z-index: 1000;
+    display: none;
+    animation: slideUp 0.3s ease;
+}
+
+@keyframes slideUp {
+    from {
+        opacity: 0;
+        transform: translateX(-50%) translateY(20px);
+    }
+    to {
+        opacity: 1;
+        transform: translateX(-50%) translateY(0);
+    }
+}
+
+.toolbar-content {
+    display: flex;
+    align-items: center;
+    gap: 20px;
+    color: white;
+    font-weight: 500;
+}
+
+.toolbar-actions {
+    display: flex;
+    gap: 12px;
+}
+
+.selected-count {
+    color: #4fc3f7;
+    font-weight: 600;
+}
+
+/* Modal Styles */
+.modal-overlay {
+    position: fixed;
+    top: 0;
+    left: 0;
+    right: 0;
+    bottom: 0;
+    background: rgba(0, 0, 0, 0.7);
+    backdrop-filter: blur(4px);
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    z-index: 10000;
+    opacity: 0;
+    transition: opacity 0.2s ease;
+}
+
+.disconnect-modal {
+    background: linear-gradient(135deg, #2a2a2a, #1e1e1e);
+    border: 1px solid rgba(255, 255, 255, 0.1);
+    border-radius: 16px;
+    min-width: 480px;
+    max-width: 90vw;
+    max-height: 90vh;
+    overflow: hidden;
+    transform: scale(0.8);
+    transition: transform 0.2s ease;
+    box-shadow: 0 20px 60px rgba(0, 0, 0, 0.4);
+}
+
+.multi-disconnect-modal {
+    min-width: 560px;
+}
+
+.modal-header {
+    background: linear-gradient(135deg, #333, #2a2a2a);
+    padding: 20px 24px;
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    border-bottom: 1px solid rgba(255, 255, 255, 0.1);
+}
+
+.modal-header h3 {
+    margin: 0;
+    color: white;
+    font-size: 1.25rem;
+    font-weight: 600;
+}
+
+.modal-close {
+    background: none;
+    border: none;
+    color: rgba(255, 255, 255, 0.6);
+    font-size: 1.5rem;
+    cursor: pointer;
+    padding: 0;
+    width: 32px;
+    height: 32px;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    border-radius: 50%;
+    transition: all 0.2s ease;
+}
+
+.modal-close:hover {
+    background: rgba(255, 255, 255, 0.1);
+    color: white;
+}
+
+.modal-content {
+    padding: 24px;
+    color: white;
+}
+
+.disconnect-warning {
+    background: linear-gradient(135deg, rgba(255, 193, 7, 0.1), rgba(255, 152, 0, 0.1));
+    border: 1px solid rgba(255, 193, 7, 0.3);
+    border-radius: 8px;
+    padding: 16px;
+    margin-bottom: 20px;
+    display: flex;
+    gap: 12px;
+    align-items: flex-start;
+}
+
+.multi-warning {
+    background: linear-gradient(135deg, rgba(244, 67, 54, 0.15), rgba(233, 30, 99, 0.1));
+    border-color: rgba(244, 67, 54, 0.4);
+}
+
+.warning-icon {
+    font-size: 1.2rem;
+    flex-shrink: 0;
+}
+
+.warning-text {
+    line-height: 1.5;
+}
+
+.warning-text strong {
+    color: #ffeb3b;
+}
+
+/* Method Selection */
+.method-selection {
+    margin-bottom: 20px;
+}
+
+.method-selection label {
+    display: block;
+    margin-bottom: 8px;
+    font-weight: 500;
+    color: rgba(255, 255, 255, 0.9);
+}
+
+.method-selection select {
+    width: 100%;
+    background: rgba(255, 255, 255, 0.1);
+    border: 1px solid rgba(255, 255, 255, 0.2);
+    border-radius: 8px;
+    padding: 12px;
+    color: white;
+    font-size: 0.9rem;
+    margin-bottom: 8px;
+}
+
+.method-selection select:focus {
+    outline: none;
+    border-color: #4fc3f7;
+    box-shadow: 0 0 0 2px rgba(79, 195, 247, 0.2);
+}
+
+.method-description {
+    font-size: 0.85rem;
+    color: rgba(255, 255, 255, 0.7);
+    line-height: 1.4;
+    padding: 8px 12px;
+    background: rgba(255, 255, 255, 0.05);
+    border-radius: 6px;
+}
+
+/* Timing Controls */
+.timing-controls {
+    margin-bottom: 20px;
+}
+
+.timing-controls label {
+    display: block;
+    margin-bottom: 8px;
+    font-weight: 500;
+    color: rgba(255, 255, 255, 0.9);
+}
+
+.time-options {
+    display: flex;
+    gap: 8px;
+    flex-wrap: wrap;
+}
+
+.time-btn {
+    background: rgba(255, 255, 255, 0.1);
+    border: 1px solid rgba(255, 255, 255, 0.2);
+    border-radius: 6px;
+    padding: 8px 14px;
+    color: rgba(255, 255, 255, 0.8);
+    cursor: pointer;
+    transition: all 0.2s ease;
+    font-size: 0.85rem;
+}
+
+.time-btn:hover {
+    background: rgba(255, 255, 255, 0.15);
+    border-color: rgba(255, 255, 255, 0.3);
+}
+
+.time-btn.active {
+    background: linear-gradient(135deg, #4fc3f7, #29b6f6);
+    border-color: #4fc3f7;
+    color: white;
+}
+
+#custom-time, #multi-custom-time {
+    background: rgba(255, 255, 255, 0.1);
+    border: 1px solid rgba(255, 255, 255, 0.2);
+    border-radius: 6px;
+    padding: 8px 12px;
+    color: white;
+    width: 120px;
+    font-size: 0.85rem;
+}
+
+#custom-time:focus, #multi-custom-time:focus {
+    outline: none;
+    border-color: #4fc3f7;
+    box-shadow: 0 0 0 2px rgba(79, 195, 247, 0.2);
+}
+
+/* Selected GPUs Display */
+.selected-gpus {
+    margin-bottom: 20px;
+}
+
+.selected-gpus label {
+    display: block;
+    margin-bottom: 8px;
+    font-weight: 500;
+    color: rgba(255, 255, 255, 0.9);
+}
+
+.gpu-list {
+    background: rgba(255, 255, 255, 0.05);
+    border: 1px solid rgba(255, 255, 255, 0.1);
+    border-radius: 8px;
+    padding: 12px;
+    font-size: 0.9rem;
+    color: rgba(255, 255, 255, 0.8);
+    max-height: 100px;
+    overflow-y: auto;
+}
+
+/* Active Processes Warning */
+.active-processes-warning {
+    background: linear-gradient(135deg, rgba(33, 150, 243, 0.1), rgba(3, 169, 244, 0.1));
+    border: 1px solid rgba(33, 150, 243, 0.3);
+    border-radius: 8px;
+    padding: 16px;
+    margin-bottom: 20px;
+    display: flex;
+    gap: 12px;
+    align-items: flex-start;
+}
+
+/* Modal Actions */
+.modal-actions {
+    background: rgba(255, 255, 255, 0.05);
+    padding: 20px 24px;
+    display: flex;
+    gap: 12px;
+    justify-content: flex-end;
+    border-top: 1px solid rgba(255, 255, 255, 0.1);
+}
+
+.btn-secondary {
+    background: rgba(255, 255, 255, 0.1);
+    border: 1px solid rgba(255, 255, 255, 0.2);
+    color: rgba(255, 255, 255, 0.8);
+    padding: 10px 20px;
+    border-radius: 8px;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    font-weight: 500;
+}
+
+.btn-secondary:hover {
+    background: rgba(255, 255, 255, 0.15);
+    color: white;
+}
+
+.btn-danger {
+    background: linear-gradient(135deg, #f44336, #d32f2f);
+    border: none;
+    color: white;
+    padding: 10px 20px;
+    border-radius: 8px;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    font-weight: 500;
+    display: flex;
+    align-items: center;
+    gap: 8px;
+}
+
+.btn-danger:hover {
+    background: linear-gradient(135deg, #e53935, #c62828);
+    box-shadow: 0 4px 12px rgba(244, 67, 54, 0.3);
+}
+
+/* GPU Status Indicators */
+.disconnect-status {
+    position: absolute;
+    top: 12px;
+    right: 12px;
+    background: rgba(0, 0, 0, 0.8);
+    color: white;
+    padding: 4px 8px;
+    border-radius: 4px;
+    font-size: 0.75rem;
+    display: flex;
+    align-items: center;
+    gap: 4px;
+    z-index: 5;
+}
+
+.gpu-card {
+    position: relative;
+}
+
+.gpu-card.disconnecting {
+    opacity: 0.7;
+    border-color: #ff9800 !important;
+}
+
+.gpu-card.disconnect-completed {
+    animation: successPulse 2s ease;
+}
+
+.gpu-card.disconnect-failed {
+    border-color: #f44336 !important;
+    animation: errorShake 0.5s ease;
+}
+
+@keyframes successPulse {
+    0%, 100% { border-color: inherit; }
+    50% { border-color: #4caf50; }
+}
+
+@keyframes errorShake {
+    0%, 100% { transform: translateX(0); }
+    25% { transform: translateX(-2px); }
+    75% { transform: translateX(2px); }
+}
+
+.status-spinner {
+    display: inline-block;
+    width: 12px;
+    height: 12px;
+    border: 2px solid transparent;
+    border-top: 2px solid #4fc3f7;
+    border-radius: 50%;
+    animation: spin 1s linear infinite;
+}
+
+@keyframes spin {
+    to { transform: rotate(360deg); }
+}
+
+.status-success {
+    color: #4caf50;
+    font-weight: bold;
+}
+
+.status-error {
+    color: #f44336;
+    font-weight: bold;
+}
+
+/* Button Spinner */
+.btn-spinner {
+    display: inline-block;
+    width: 14px;
+    height: 14px;
+    border: 2px solid transparent;
+    border-top: 2px solid currentColor;
+    border-radius: 50%;
+    animation: spin 1s linear infinite;
+}
+
+/* Notifications */
+.notification-container {
+    position: fixed;
+    top: 20px;
+    right: 20px;
+    z-index: 10001;
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+}
+
+.notification {
+    background: rgba(45, 45, 45, 0.95);
+    backdrop-filter: blur(10px);
+    border-radius: 8px;
+    padding: 0;
+    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
+    border-left: 4px solid;
+    animation: slideInRight 0.3s ease;
+    max-width: 400px;
+}
+
+.notification-info {
+    border-left-color: #2196f3;
+}
+
+.notification-success {
+    border-left-color: #4caf50;
+}
+
+.notification-warning {
+    border-left-color: #ff9800;
+}
+
+.notification-error {
+    border-left-color: #f44336;
+}
+
+@keyframes slideInRight {
+    from {
+        opacity: 0;
+        transform: translateX(100%);
+    }
+    to {
+        opacity: 1;
+        transform: translateX(0);
+    }
+}
+
+.notification-content {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    padding: 12px 16px;
+    color: white;
+}
+
+.notification-message {
+    flex: 1;
+    font-size: 0.9rem;
+}
+
+.notification-close {
+    background: none;
+    border: none;
+    color: rgba(255, 255, 255, 0.6);
+    cursor: pointer;
+    font-size: 1.2rem;
+    padding: 0;
+    margin-left: 12px;
+    width: 20px;
+    height: 20px;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    border-radius: 50%;
+    transition: all 0.2s ease;
+}
+
+.notification-close:hover {
+    background: rgba(255, 255, 255, 0.1);
+    color: white;
+}
+
+/* Responsive Design */
+@media (max-width: 768px) {
+    .disconnect-modal {
+        min-width: 90vw;
+        margin: 20px;
+    }
+    
+    .multi-select-toolbar {
+        left: 10px;
+        right: 10px;
+        transform: none;
+        border-radius: 8px;
+    }
+    
+    .toolbar-content {
+        flex-direction: column;
+        gap: 12px;
+        text-align: center;
+    }
+    
+    .time-options {
+        justify-content: center;
+    }
+    
+    .notification-container {
+        left: 10px;
+        right: 10px;
+        top: 10px;
+    }
+    
+    .notification {
+        max-width: none;
+    }
+}
+
+/* Dark mode adjustments */
+@media (prefers-color-scheme: dark) {
+    .disconnect-modal {
+        background: linear-gradient(135deg, #1a1a1a, #0d1117);
+        border-color: rgba(255, 255, 255, 0.1);
+    }
+    
+    .modal-header {
+        background: linear-gradient(135deg, #21262d, #1a1a1a);
+    }
+}
diff --git a/static/js/gpu-cards.js b/static/js/gpu-cards.js
index 889dd34..1b1b24d 100644
--- a/static/js/gpu-cards.js
+++ b/static/js/gpu-cards.js
@@ -9,7 +9,13 @@ function createOverviewCard(gpuId, gpuInfo) {
     const memPercent = (memory_used / memory_total) * 100;
 
     return `
-        <div class="overview-gpu-card" data-gpu-id="${gpuId}" onclick="switchToView('gpu-${gpuId}')" style="pointer-events: auto;">
+        <div class="overview-gpu-card" data-gpu-id="${gpuId}" onclick="switchToView('gpu-${gpuId}')" style="pointer-events: auto; position: relative;">
+            <div class="gpu-select-container" style="position: absolute; top: 10px; right: 10px; z-index: 10;">
+                <label onclick="event.stopPropagation();">
+                    <input type="checkbox" class="gpu-select-checkbox" data-gpu-id="${gpuId}">
+                    Select
+                </label>
+            </div>
             <div class="overview-header">
                 <div>
                     <h2 style="font-size: 1.5rem; font-weight: 700; background: var(--primary-gradient); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; margin-bottom: 0.25rem;">
@@ -631,6 +637,19 @@ function createGPUCard(gpuId, gpuInfo) {
                     <canvas id="chart-appclocks-${gpuId}"></canvas>
                 </div>` : ''}
             </div>
+            
+            <!-- GPU Actions Section -->
+            <div class="gpu-actions">
+                <div class="gpu-select-container">
+                    <label>
+                        <input type="checkbox" class="gpu-select-checkbox" data-gpu-id="${gpuId}">
+                        Select
+                    </label>
+                </div>
+                <button class="disconnect-button" onclick="showDisconnectModal(${gpuId})">
+                    <span class="disconnect-icon">⚡</span> Disconnect
+                </button>
+            </div>
         </div>
     `;
 }
diff --git a/static/js/gpu-disconnect.js b/static/js/gpu-disconnect.js
new file mode 100644
index 0000000..5ca469a
--- /dev/null
+++ b/static/js/gpu-disconnect.js
@@ -0,0 +1,963 @@
+/**
+ * GPU Disconnect Controls - Frontend functionality for GPU disconnect operations
+ * Handles method selection modals, confirmations, and status updates
+ */
+
+// Global state for disconnect operations
+let disconnectState = {
+    currentGpu: null,
+    selectedGpus: new Set(),
+    disconnectMethods: {},
+    systemCapabilities: null,
+    hubMode: false,
+    nodeInfo: {}
+};
+
+// Disconnect operation status
+let activeDisconnects = new Map(); // gpuId -> {status, startTime, method}
+
+/**
+ * Initialize disconnect controls
+ */
+function initDisconnectControls() {
+    console.log('Initializing GPU disconnect controls');
+    
+    // Check system capabilities
+    checkDisconnectCapabilities();
+    
+    // Setup UI event listeners
+    setupDisconnectEventListeners();
+    
+    // Check if we're in hub mode
+    detectHubMode();
+}
+
+/**
+ * Check system disconnect capabilities
+ */
+async function checkDisconnectCapabilities() {
+    try {
+        let endpoint = disconnectState.hubMode ? '/api/hub/gpu/disconnect/status' : '/api/gpu/disconnect/status';
+        const response = await fetch(endpoint);
+        const data = await response.json();
+        
+        disconnectState.systemCapabilities = data;
+        console.log('System disconnect capabilities:', data);
+        
+        // Update UI based on capabilities
+        updateDisconnectUI();
+        
+    } catch (error) {
+        console.error('Error checking disconnect capabilities:', error);
+        disconnectState.systemCapabilities = { ready: false };
+        updateDisconnectUI();
+    }
+}
+
+/**
+ * Detect if we're in hub mode
+ */
+function detectHubMode() {
+    // Check if we have hub-specific data in the page
+    disconnectState.hubMode = window.location.pathname.includes('hub') || 
+                              document.body.classList.contains('hub-mode') ||
+                              (window.currentData && window.currentData.mode === 'hub');
+    
+    if (disconnectState.hubMode) {
+        console.log('Running in hub mode - enabling hub disconnect features');
+        loadNodeInfo();
+    }
+}
+
+/**
+ * Load node information for hub mode
+ */
+async function loadNodeInfo() {
+    try {
+        const response = await fetch('/api/hub/nodes');
+        const data = await response.json();
+        disconnectState.nodeInfo = data;
+        console.log('Node info loaded:', data);
+    } catch (error) {
+        console.error('Error loading node info:', error);
+    }
+}
+
+/**
+ * Setup event listeners for disconnect controls
+ */
+function setupDisconnectEventListeners() {
+    // Listen for modal close events
+    document.addEventListener('click', (e) => {
+        if (e.target.classList.contains('modal-overlay')) {
+            closeDisconnectModal();
+        }
+    });
+    
+    // Listen for ESC key to close modals
+    document.addEventListener('keydown', (e) => {
+        if (e.key === 'Escape') {
+            closeDisconnectModal();
+        }
+    });
+    
+    // Listen for multi-select changes
+    document.addEventListener('change', (e) => {
+        if (e.target.classList.contains('gpu-select-checkbox')) {
+            handleGPUSelection(e);
+        }
+    });
+}
+
+/**
+ * Add disconnect button to a GPU card
+ */
+function addDisconnectButton(gpuId, gpuCard, nodeInfo = null) {
+    // Check if button already exists
+    if (gpuCard.querySelector('.disconnect-button')) {
+        return;
+    }
+    
+    // Create disconnect button
+    const disconnectBtn = document.createElement('button');
+    disconnectBtn.className = 'disconnect-button';
+    disconnectBtn.innerHTML = '<span class="disconnect-icon">⚡</span> Disconnect';
+    disconnectBtn.onclick = () => showDisconnectModal(gpuId, nodeInfo);
+    
+    // Add to GPU card actions area
+    let actionsArea = gpuCard.querySelector('.gpu-actions');
+    if (!actionsArea) {
+        actionsArea = document.createElement('div');
+        actionsArea.className = 'gpu-actions';
+        gpuCard.appendChild(actionsArea);
+    }
+    
+    actionsArea.appendChild(disconnectBtn);
+    
+    // Update button state based on system capabilities
+    updateDisconnectButtonState(disconnectBtn, gpuId);
+}
+
+/**
+ * Add multi-select checkbox to GPU card
+ */
+function addGPUSelectCheckbox(gpuId, gpuCard, nodeInfo = null) {
+    // Check if checkbox already exists
+    if (gpuCard.querySelector('.gpu-select-checkbox')) {
+        return;
+    }
+    
+    // Create checkbox container
+    const checkboxContainer = document.createElement('div');
+    checkboxContainer.className = 'gpu-select-container';
+    
+    const checkbox = document.createElement('input');
+    checkbox.type = 'checkbox';
+    checkbox.className = 'gpu-select-checkbox';
+    checkbox.dataset.gpuId = gpuId;
+    if (nodeInfo) {
+        checkbox.dataset.nodeName = nodeInfo.node_name;
+    }
+    
+    const label = document.createElement('label');
+    label.appendChild(checkbox);
+    label.appendChild(document.createTextNode(' Select'));
+    
+    checkboxContainer.appendChild(label);
+    
+    // Add to GPU card header
+    const header = gpuCard.querySelector('.gpu-header') || gpuCard.querySelector('h3');
+    if (header) {
+        header.style.position = 'relative';
+        checkboxContainer.style.position = 'absolute';
+        checkboxContainer.style.right = '10px';
+        checkboxContainer.style.top = '10px';
+        header.appendChild(checkboxContainer);
+    }
+}
+
+/**
+ * Show disconnect modal for a specific GPU
+ */
+async function showDisconnectModal(gpuId, nodeInfo = null) {
+    disconnectState.currentGpu = { id: gpuId, node: nodeInfo };
+    
+    try {
+        // Get available methods
+        const methods = await getAvailableMethods(gpuId, nodeInfo);
+        disconnectState.disconnectMethods[gpuId] = methods;
+        
+        // Create and show modal
+        const modal = createDisconnectModal(gpuId, methods, nodeInfo);
+        document.body.appendChild(modal);
+        
+        // Animate modal in
+        requestAnimationFrame(() => {
+            modal.style.opacity = '1';
+            modal.querySelector('.disconnect-modal').style.transform = 'scale(1)';
+        });
+        
+    } catch (error) {
+        console.error('Error showing disconnect modal:', error);
+        showNotification(`Error loading disconnect options: ${error.message}`, 'error');
+    }
+}
+
+/**
+ * Show multi-GPU disconnect modal
+ */
+function showMultiDisconnectModal() {
+    if (disconnectState.selectedGpus.size === 0) {
+        showNotification('Please select at least one GPU', 'warning');
+        return;
+    }
+    
+    const selectedArray = Array.from(disconnectState.selectedGpus);
+    console.log('Showing multi-disconnect modal for:', selectedArray);
+    
+    const modal = createMultiDisconnectModal(selectedArray);
+    document.body.appendChild(modal);
+    
+    // Animate modal in
+    requestAnimationFrame(() => {
+        modal.style.opacity = '1';
+        modal.querySelector('.disconnect-modal').style.transform = 'scale(1)';
+    });
+}
+
+/**
+ * Create disconnect modal HTML
+ */
+function createDisconnectModal(gpuId, methods, nodeInfo) {
+    const modalHtml = `
+        <div class="modal-overlay disconnect-modal-overlay">
+            <div class="disconnect-modal">
+                <div class="modal-header">
+                    <h3>Disconnect ${nodeInfo ? `${nodeInfo.node_name}/` : ''}GPU ${gpuId}</h3>
+                    <button class="modal-close" onclick="closeDisconnectModal()">×</button>
+                </div>
+                
+                <div class="modal-content">
+                    <div class="disconnect-warning">
+                        <div class="warning-icon">⚠️</div>
+                        <div class="warning-text">
+                            <strong>Caution:</strong> This will temporarily disconnect the GPU, interrupting any running processes.
+                            The GPU will automatically reconnect after the specified time.
+                        </div>
+                    </div>
+                    
+                    <div class="method-selection">
+                        <label>Disconnect Method:</label>
+                        <select id="disconnect-method-select">
+                            ${methods.map(method => `
+                                <option value="${method}">${formatMethodName(method)}</option>
+                            `).join('')}
+                        </select>
+                        <div class="method-description" id="method-description">
+                            ${getMethodDescription(methods[0])}
+                        </div>
+                    </div>
+                    
+                    <div class="timing-controls">
+                        <label>Disconnect Duration:</label>
+                        <div class="time-options">
+                            <button class="time-btn active" data-time="5">5 sec</button>
+                            <button class="time-btn" data-time="10">10 sec</button>
+                            <button class="time-btn" data-time="30">30 sec</button>
+                            <button class="time-btn" data-time="60">1 min</button>
+                            <input type="number" id="custom-time" placeholder="Custom (sec)" min="1" max="300">
+                        </div>
+                    </div>
+                    
+                    <div class="active-processes-warning" id="processes-warning" style="display: none;">
+                        <div class="warning-icon">🔄</div>
+                        <div class="warning-text">
+                            <strong>Active Processes Detected:</strong> This GPU may have running processes that will be interrupted.
+                        </div>
+                    </div>
+                </div>
+                
+                <div class="modal-actions">
+                    <button class="btn-secondary" onclick="closeDisconnectModal()">Cancel</button>
+                    <button class="btn-danger" onclick="executeDisconnect()">
+                        <span class="disconnect-icon">⚡</span> Disconnect GPU
+                    </button>
+                </div>
+            </div>
+        </div>
+    `;
+    
+    const modal = document.createElement('div');
+    modal.innerHTML = modalHtml;
+    const modalElement = modal.firstElementChild;
+    
+    // Setup event listeners
+    setupModalEventListeners(modalElement);
+    
+    return modalElement;
+}
+
+/**
+ * Create multi-GPU disconnect modal
+ */
+function createMultiDisconnectModal(selectedGpus) {
+    const gpuList = selectedGpus.map(gpu => {
+        if (typeof gpu === 'object') {
+            return `${gpu.node || 'local'}/${gpu.id}`;
+        }
+        return `GPU ${gpu}`;
+    }).join(', ');
+    
+    const modalHtml = `
+        <div class="modal-overlay disconnect-modal-overlay">
+            <div class="disconnect-modal multi-disconnect-modal">
+                <div class="modal-header">
+                    <h3>Disconnect Multiple GPUs</h3>
+                    <button class="modal-close" onclick="closeDisconnectModal()">×</button>
+                </div>
+                
+                <div class="modal-content">
+                    <div class="selected-gpus">
+                        <label>Selected GPUs (${selectedGpus.length}):</label>
+                        <div class="gpu-list">${gpuList}</div>
+                    </div>
+                    
+                    <div class="disconnect-warning multi-warning">
+                        <div class="warning-icon">⚠️</div>
+                        <div class="warning-text">
+                            <strong>Mass Disconnect Warning:</strong> This will disconnect <strong>${selectedGpus.length} GPUs</strong> simultaneously.
+                            All running processes on these GPUs will be interrupted.
+                        </div>
+                    </div>
+                    
+                    <div class="method-selection">
+                        <label>Disconnect Method:</label>
+                        <select id="multi-disconnect-method-select">
+                            <option value="auto">Auto (Best Available)</option>
+                            <option value="logical">Logical Remove/Rescan</option>
+                            <option value="hot">Hot Reset</option>
+                            <option value="slot">Slot Power (if available)</option>
+                        </select>
+                        <div class="method-description" id="multi-method-description">
+                            ${getMethodDescription('auto')}
+                        </div>
+                    </div>
+                    
+                    <div class="timing-controls">
+                        <label>Disconnect Duration:</label>
+                        <div class="time-options">
+                            <button class="time-btn active" data-time="5">5 sec</button>
+                            <button class="time-btn" data-time="10">10 sec</button>
+                            <button class="time-btn" data-time="30">30 sec</button>
+                            <input type="number" id="multi-custom-time" placeholder="Custom (sec)" min="1" max="300">
+                        </div>
+                    </div>
+                </div>
+                
+                <div class="modal-actions">
+                    <button class="btn-secondary" onclick="closeDisconnectModal()">Cancel</button>
+                    <button class="btn-danger" onclick="executeMultiDisconnect()">
+                        <span class="disconnect-icon">⚡</span> Disconnect All Selected
+                    </button>
+                </div>
+            </div>
+        </div>
+    `;
+    
+    const modal = document.createElement('div');
+    modal.innerHTML = modalHtml;
+    const modalElement = modal.firstElementChild;
+    
+    // Setup event listeners
+    setupModalEventListeners(modalElement);
+    
+    return modalElement;
+}
+
+/**
+ * Setup modal event listeners
+ */
+function setupModalEventListeners(modal) {
+    // Method selection change
+    const methodSelect = modal.querySelector('#disconnect-method-select, #multi-disconnect-method-select');
+    if (methodSelect) {
+        methodSelect.addEventListener('change', (e) => {
+            const description = modal.querySelector('#method-description, #multi-method-description');
+            if (description) {
+                description.textContent = getMethodDescription(e.target.value);
+            }
+        });
+    }
+    
+    // Time button selection
+    modal.querySelectorAll('.time-btn').forEach(btn => {
+        btn.addEventListener('click', (e) => {
+            modal.querySelectorAll('.time-btn').forEach(b => b.classList.remove('active'));
+            e.target.classList.add('active');
+            
+            // Clear custom input
+            const customInput = modal.querySelector('#custom-time, #multi-custom-time');
+            if (customInput) customInput.value = '';
+        });
+    });
+    
+    // Custom time input
+    const customInput = modal.querySelector('#custom-time, #multi-custom-time');
+    if (customInput) {
+        customInput.addEventListener('input', () => {
+            modal.querySelectorAll('.time-btn').forEach(btn => btn.classList.remove('active'));
+        });
+    }
+}
+
+/**
+ * Close disconnect modal
+ */
+function closeDisconnectModal() {
+    const modal = document.querySelector('.disconnect-modal-overlay');
+    if (modal) {
+        modal.style.opacity = '0';
+        modal.querySelector('.disconnect-modal').style.transform = 'scale(0.8)';
+        setTimeout(() => {
+            modal.remove();
+        }, 200);
+    }
+    
+    disconnectState.currentGpu = null;
+}
+
+/**
+ * Execute single GPU disconnect
+ */
+async function executeDisconnect() {
+    if (!disconnectState.currentGpu) return;
+    
+    const modal = document.querySelector('.disconnect-modal-overlay');
+    const methodSelect = modal.querySelector('#disconnect-method-select');
+    const customTime = modal.querySelector('#custom-time');
+    const activeTimeBtn = modal.querySelector('.time-btn.active');
+    
+    const method = methodSelect.value;
+    const downTime = customTime.value ? parseFloat(customTime.value) : 
+                     activeTimeBtn ? parseFloat(activeTimeBtn.dataset.time) : 5;
+    
+    const gpuId = disconnectState.currentGpu.id;
+    const nodeInfo = disconnectState.currentGpu.node;
+    
+    try {
+        closeDisconnectModal();
+        
+        // Mark as active
+        activeDisconnects.set(gpuId, {
+            status: 'starting',
+            startTime: Date.now(),
+            method: method,
+            downTime: downTime
+        });
+        
+        // Update UI
+        updateGPUDisconnectStatus(gpuId, 'starting');
+        showNotification(`Starting disconnect of ${nodeInfo ? `${nodeInfo.node_name}/` : ''}GPU ${gpuId}...`, 'info');
+        
+        // Execute disconnect
+        const result = await performDisconnect(gpuId, method, downTime, nodeInfo);
+        
+        // Update status
+        activeDisconnects.set(gpuId, {
+            status: 'completed',
+            startTime: activeDisconnects.get(gpuId).startTime,
+            method: method,
+            result: result
+        });
+        
+        updateGPUDisconnectStatus(gpuId, 'completed');
+        showNotification(`GPU ${gpuId} disconnect completed successfully`, 'success');
+        
+        // Clear status after delay
+        setTimeout(() => {
+            activeDisconnects.delete(gpuId);
+            updateGPUDisconnectStatus(gpuId, 'idle');
+        }, 5000);
+        
+    } catch (error) {
+        console.error('Disconnect failed:', error);
+        
+        activeDisconnects.set(gpuId, {
+            status: 'failed',
+            startTime: activeDisconnects.get(gpuId)?.startTime || Date.now(),
+            method: method,
+            error: error.message
+        });
+        
+        updateGPUDisconnectStatus(gpuId, 'failed');
+        showNotification(`GPU ${gpuId} disconnect failed: ${error.message}`, 'error');
+        
+        // Clear error status after delay
+        setTimeout(() => {
+            activeDisconnects.delete(gpuId);
+            updateGPUDisconnectStatus(gpuId, 'idle');
+        }, 10000);
+    }
+}
+
+/**
+ * Execute multi-GPU disconnect
+ */
+async function executeMultiDisconnect() {
+    const modal = document.querySelector('.disconnect-modal-overlay');
+    const methodSelect = modal.querySelector('#multi-disconnect-method-select');
+    const customTime = modal.querySelector('#multi-custom-time');
+    const activeTimeBtn = modal.querySelector('.time-btn.active');
+    
+    const method = methodSelect.value;
+    const downTime = customTime.value ? parseFloat(customTime.value) : 
+                     activeTimeBtn ? parseFloat(activeTimeBtn.dataset.time) : 5;
+    
+    const selectedGpus = Array.from(disconnectState.selectedGpus);
+    
+    try {
+        closeDisconnectModal();
+        
+        // Mark all as active
+        selectedGpus.forEach(gpu => {
+            const gpuId = typeof gpu === 'object' ? gpu.id : gpu;
+            activeDisconnects.set(gpuId, {
+                status: 'starting',
+                startTime: Date.now(),
+                method: method,
+                downTime: downTime
+            });
+            updateGPUDisconnectStatus(gpuId, 'starting');
+        });
+        
+        showNotification(`Starting disconnect of ${selectedGpus.length} GPUs...`, 'info');
+        
+        // Execute multi-disconnect
+        const result = await performMultiDisconnect(selectedGpus, method, downTime);
+        
+        // Process results
+        Object.entries(result.results || {}).forEach(([key, res]) => {
+            const gpuId = res.gpu_index;
+            activeDisconnects.set(gpuId, {
+                status: 'completed',
+                startTime: activeDisconnects.get(gpuId).startTime,
+                method: method,
+                result: res
+            });
+            updateGPUDisconnectStatus(gpuId, 'completed');
+        });
+        
+        Object.entries(result.errors || {}).forEach(([key, error]) => {
+            // Extract GPU ID from key or use the key itself
+            const gpuId = key;
+            activeDisconnects.set(gpuId, {
+                status: 'failed',
+                startTime: activeDisconnects.get(gpuId)?.startTime || Date.now(),
+                method: method,
+                error: error
+            });
+            updateGPUDisconnectStatus(gpuId, 'failed');
+        });
+        
+        const successful = result.successful || 0;
+        const failed = result.failed || 0;
+        
+        if (failed === 0) {
+            showNotification(`All ${successful} GPUs disconnected successfully`, 'success');
+        } else {
+            showNotification(`${successful} GPUs successful, ${failed} failed`, 'warning');
+        }
+        
+        // Clear statuses after delay
+        setTimeout(() => {
+            selectedGpus.forEach(gpu => {
+                const gpuId = typeof gpu === 'object' ? gpu.id : gpu;
+                activeDisconnects.delete(gpuId);
+                updateGPUDisconnectStatus(gpuId, 'idle');
+            });
+        }, 5000);
+        
+        // Clear selection
+        clearGPUSelection();
+        
+    } catch (error) {
+        console.error('Multi-disconnect failed:', error);
+        
+        selectedGpus.forEach(gpu => {
+            const gpuId = typeof gpu === 'object' ? gpu.id : gpu;
+            activeDisconnects.set(gpuId, {
+                status: 'failed',
+                startTime: activeDisconnects.get(gpuId)?.startTime || Date.now(),
+                method: method,
+                error: error.message
+            });
+            updateGPUDisconnectStatus(gpuId, 'failed');
+        });
+        
+        showNotification(`Multi-disconnect failed: ${error.message}`, 'error');
+        
+        // Clear error statuses after delay
+        setTimeout(() => {
+            selectedGpus.forEach(gpu => {
+                const gpuId = typeof gpu === 'object' ? gpu.id : gpu;
+                activeDisconnects.delete(gpuId);
+                updateGPUDisconnectStatus(gpuId, 'idle');
+            });
+        }, 10000);
+    }
+}
+
+/**
+ * Get available disconnect methods for a GPU
+ */
+async function getAvailableMethods(gpuId, nodeInfo) {
+    try {
+        let endpoint;
+        if (nodeInfo && disconnectState.hubMode) {
+            endpoint = `/api/hub/gpu/${nodeInfo.node_name}/${gpuId}/disconnect/methods`;
+        } else {
+            endpoint = `/api/gpu/${gpuId}/disconnect/methods`;
+        }
+        
+        const response = await fetch(endpoint);
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+        }
+        
+        const data = await response.json();
+        return data.available_methods || ['auto'];
+        
+    } catch (error) {
+        console.error('Error getting available methods:', error);
+        return ['auto']; // Fallback
+    }
+}
+
+/**
+ * Perform single GPU disconnect
+ */
+async function performDisconnect(gpuId, method, downTime, nodeInfo) {
+    let endpoint;
+    let requestData = {
+        method: method,
+        down_time: downTime
+    };
+    
+    if (nodeInfo && disconnectState.hubMode) {
+        endpoint = `/api/hub/gpu/${nodeInfo.node_name}/${gpuId}/disconnect`;
+    } else {
+        endpoint = `/api/gpu/${gpuId}/disconnect`;
+    }
+    
+    const response = await fetch(endpoint, {
+        method: 'POST',
+        headers: {
+            'Content-Type': 'application/json'
+        },
+        body: JSON.stringify(requestData)
+    });
+    
+    if (!response.ok) {
+        const errorData = await response.json().catch(() => ({}));
+        throw new Error(errorData.detail || `HTTP ${response.status}: ${response.statusText}`);
+    }
+    
+    return await response.json();
+}
+
+/**
+ * Perform multi-GPU disconnect
+ */
+async function performMultiDisconnect(selectedGpus, method, downTime) {
+    if (disconnectState.hubMode) {
+        // Hub mode - targets include node information
+        const targets = selectedGpus.map(gpu => {
+            if (typeof gpu === 'object') {
+                return { node_name: gpu.node, gpu_id: gpu.id };
+            } else {
+                return { node_name: 'local', gpu_id: gpu };
+            }
+        });
+        
+        const response = await fetch('/api/hub/gpu/disconnect-multiple', {
+            method: 'POST',
+            headers: {
+                'Content-Type': 'application/json'
+            },
+            body: JSON.stringify({
+                targets: targets,
+                method: method,
+                down_time: downTime
+            })
+        });
+        
+        if (!response.ok) {
+            const errorData = await response.json().catch(() => ({}));
+            throw new Error(errorData.detail || `HTTP ${response.status}: ${response.statusText}`);
+        }
+        
+        return await response.json();
+        
+    } else {
+        // Node mode - simple GPU indices
+        const gpuIndices = selectedGpus.map(gpu => typeof gpu === 'object' ? gpu.id : gpu);
+        
+        const response = await fetch('/api/gpu/disconnect-multiple', {
+            method: 'POST',
+            headers: {
+                'Content-Type': 'application/json'
+            },
+            body: JSON.stringify({
+                gpu_indices: gpuIndices,
+                method: method,
+                down_time: downTime
+            })
+        });
+        
+        if (!response.ok) {
+            const errorData = await response.json().catch(() => ({}));
+            throw new Error(errorData.detail || `HTTP ${response.status}: ${response.statusText}`);
+        }
+        
+        return await response.json();
+    }
+}
+
+/**
+ * Handle GPU selection checkbox changes
+ */
+function handleGPUSelection(event) {
+    const checkbox = event.target;
+    const gpuId = checkbox.dataset.gpuId;
+    const nodeName = checkbox.dataset.nodeName;
+    
+    const gpuIdentifier = nodeName ? { id: gpuId, node: nodeName } : gpuId;
+    
+    if (checkbox.checked) {
+        disconnectState.selectedGpus.add(gpuIdentifier);
+    } else {
+        disconnectState.selectedGpus.delete(gpuIdentifier);
+    }
+    
+    updateMultiSelectUI();
+}
+
+/**
+ * Clear GPU selection
+ */
+function clearGPUSelection() {
+    disconnectState.selectedGpus.clear();
+    document.querySelectorAll('.gpu-select-checkbox').forEach(cb => {
+        cb.checked = false;
+    });
+    updateMultiSelectUI();
+}
+
+/**
+ * Update multi-select UI
+ */
+function updateMultiSelectUI() {
+    const selectedCount = disconnectState.selectedGpus.size;
+    
+    // Update or create multi-select toolbar
+    let toolbar = document.querySelector('.multi-select-toolbar');
+    
+    if (selectedCount > 0) {
+        if (!toolbar) {
+            toolbar = createMultiSelectToolbar();
+            document.querySelector('.container').appendChild(toolbar);
+        }
+        
+        toolbar.querySelector('.selected-count').textContent = selectedCount;
+        toolbar.style.display = 'flex';
+        
+    } else if (toolbar) {
+        toolbar.style.display = 'none';
+    }
+}
+
+/**
+ * Create multi-select toolbar
+ */
+function createMultiSelectToolbar() {
+    const toolbar = document.createElement('div');
+    toolbar.className = 'multi-select-toolbar';
+    toolbar.innerHTML = `
+        <div class="toolbar-content">
+            <span class="selected-count">0</span> GPUs selected
+            <div class="toolbar-actions">
+                <button class="btn-secondary" onclick="clearGPUSelection()">Clear Selection</button>
+                <button class="btn-danger" onclick="showMultiDisconnectModal()">
+                    <span class="disconnect-icon">⚡</span> Disconnect Selected
+                </button>
+            </div>
+        </div>
+    `;
+    
+    return toolbar;
+}
+
+/**
+ * Update GPU disconnect status UI
+ */
+function updateGPUDisconnectStatus(gpuId, status) {
+    const gpuCard = document.getElementById(`gpu-${gpuId}`);
+    if (!gpuCard) return;
+    
+    // Remove existing status classes
+    gpuCard.classList.remove('disconnecting', 'disconnect-completed', 'disconnect-failed');
+    
+    // Add status indicator
+    let statusIndicator = gpuCard.querySelector('.disconnect-status');
+    if (!statusIndicator) {
+        statusIndicator = document.createElement('div');
+        statusIndicator.className = 'disconnect-status';
+        gpuCard.appendChild(statusIndicator);
+    }
+    
+    switch (status) {
+        case 'starting':
+            gpuCard.classList.add('disconnecting');
+            statusIndicator.innerHTML = '<div class="status-spinner"></div> Disconnecting...';
+            statusIndicator.style.display = 'block';
+            break;
+            
+        case 'completed':
+            gpuCard.classList.add('disconnect-completed');
+            statusIndicator.innerHTML = '<span class="status-success">✓</span> Reconnected';
+            statusIndicator.style.display = 'block';
+            break;
+            
+        case 'failed':
+            gpuCard.classList.add('disconnect-failed');
+            statusIndicator.innerHTML = '<span class="status-error">✗</span> Disconnect Failed';
+            statusIndicator.style.display = 'block';
+            break;
+            
+        default:
+            statusIndicator.style.display = 'none';
+    }
+    
+    // Update disconnect button state
+    const disconnectBtn = gpuCard.querySelector('.disconnect-button');
+    if (disconnectBtn) {
+        disconnectBtn.disabled = (status === 'starting');
+    }
+}
+
+/**
+ * Update disconnect UI based on system capabilities
+ */
+function updateDisconnectUI() {
+    const capabilities = disconnectState.systemCapabilities;
+    if (!capabilities) return;
+    
+    // Update all disconnect buttons
+    document.querySelectorAll('.disconnect-button').forEach(btn => {
+        if (!capabilities.ready) {
+            btn.disabled = true;
+            btn.title = 'Disconnect unavailable: ' + (capabilities.warnings || []).join(', ');
+        } else {
+            btn.disabled = false;
+            btn.title = 'Disconnect GPU for fault tolerance testing';
+        }
+    });
+    
+    // Show system status if there are issues
+    if (!capabilities.ready) {
+        console.warn('GPU disconnect not ready:', capabilities.warnings);
+    }
+}
+
+/**
+ * Update disconnect button state
+ */
+function updateDisconnectButtonState(button, gpuId) {
+    const status = activeDisconnects.get(gpuId);
+    const capabilities = disconnectState.systemCapabilities;
+    
+    if (status && status.status === 'starting') {
+        button.disabled = true;
+        button.innerHTML = '<div class="btn-spinner"></div> Disconnecting...';
+    } else if (!capabilities || !capabilities.ready) {
+        button.disabled = true;
+        button.title = 'Disconnect unavailable';
+    } else {
+        button.disabled = false;
+        button.innerHTML = '<span class="disconnect-icon">⚡</span> Disconnect';
+        button.title = 'Disconnect GPU';
+    }
+}
+
+/**
+ * Format method name for display
+ */
+function formatMethodName(method) {
+    const names = {
+        'auto': 'Auto (Best Available)',
+        'slot': 'Slot Power Toggle',
+        'hot': 'Hot Reset',
+        'logical': 'Logical Remove/Rescan',
+        'nvidia': 'NVIDIA GPU Reset'
+    };
+    return names[method] || method.charAt(0).toUpperCase() + method.slice(1);
+}
+
+/**
+ * Get method description
+ */
+function getMethodDescription(method) {
+    const descriptions = {
+        'auto': 'Automatically select the most realistic method available on this system.',
+        'slot': 'Actually cut and restore slot power (closest to physical disconnect).',
+        'hot': 'Reset the PCIe link using upstream bridge controls.',
+        'logical': 'Software-only remove and re-scan (no hardware reset).',
+        'nvidia': 'Use NVIDIA driver reset functionality.'
+    };
+    return descriptions[method] || 'Custom disconnect method.';
+}
+
+/**
+ * Show notification
+ */
+function showNotification(message, type = 'info') {
+    // Create notification element
+    const notification = document.createElement('div');
+    notification.className = `notification notification-${type}`;
+    notification.innerHTML = `
+        <div class="notification-content">
+            <span class="notification-message">${message}</span>
+            <button class="notification-close" onclick="this.parentElement.parentElement.remove()">×</button>
+        </div>
+    `;
+    
+    // Add to page
+    let container = document.querySelector('.notification-container');
+    if (!container) {
+        container = document.createElement('div');
+        container.className = 'notification-container';
+        document.body.appendChild(container);
+    }
+    
+    container.appendChild(notification);
+    
+    // Auto-remove after delay
+    setTimeout(() => {
+        if (notification.parentElement) {
+            notification.remove();
+        }
+    }, 5000);
+}
+
+// Initialize when page loads
+document.addEventListener('DOMContentLoaded', () => {
+    // Small delay to ensure other scripts have loaded
+    setTimeout(initDisconnectControls, 100);
+});
+
+// Export functions for use by other modules
+window.addDisconnectButton = addDisconnectButton;
+window.addGPUSelectCheckbox = addGPUSelectCheckbox;
+window.showDisconnectModal = showDisconnectModal;
+window.showMultiDisconnectModal = showMultiDisconnectModal;
+window.clearGPUSelection = clearGPUSelection;
diff --git a/templates/index.html b/templates/index.html
index 13f237d..6f1fcc0 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -12,6 +12,7 @@
     
     <!-- Application Styles -->
     <link rel="stylesheet" href="/static/css/styles.css">
+    <link rel="stylesheet" href="/static/css/disconnect-controls.css">
 </head>
 <body>
     <div class="container">
@@ -82,10 +83,11 @@ <h1>🔥 GPU Hot</h1>
     </div>
 
     <!-- Application Scripts -->
-    <!-- Load in order: charts -> gpu-cards -> ui -> socket-handlers -> app -->
+    <!-- Load in order: charts -> gpu-cards -> ui -> disconnect -> socket-handlers -> app -->
     <script src="/static/js/charts.js"></script>
     <script src="/static/js/gpu-cards.js"></script>
     <script src="/static/js/ui.js"></script>
+    <script src="/static/js/gpu-disconnect.js"></script>
     <script src="/static/js/socket-handlers.js"></script>
     <script src="/static/js/app.js"></script>
 </body>

From 5d7e7fc2c537648df60ee4b319ccfa2ba8009825 Mon Sep 17 00:00:00 2001
From: SpyrosMouselinos <mouselinos.spur.kw@gmail.com>
Date: Thu, 23 Oct 2025 11:57:11 +0200
Subject: [PATCH 2/5] Disconnect

---
 README.md                                |  39 ++
 core/gpu_test_workloads.py               | 432 +++++++++++++++++++++++
 core/handlers.py                         | 100 ++++++
 requirements.txt                         |   3 +-
 test_quick_validation.py                 | 227 ++++++++++++
 tests/README.md                          | 260 +++++++++++---
 tests/test_gpu_disconnect_integration.py | 407 +++++++++++++++++++++
 7 files changed, 1422 insertions(+), 46 deletions(-)
 create mode 100644 core/gpu_test_workloads.py
 create mode 100644 test_quick_validation.py
 create mode 100644 tests/test_gpu_disconnect_integration.py

diff --git a/README.md b/README.md
index 758c55e..109db98 100644
--- a/README.md
+++ b/README.md
@@ -126,6 +126,37 @@ POST /api/hub/gpu/{node_name}/{gpu_id}/disconnect
 POST /api/hub/gpu/disconnect-multiple
 ```
 
+### Integration Testing
+
+GPU Hot includes a comprehensive testing framework to validate disconnect functionality:
+
+**Run Full Test Suite:**
+```bash
+cd tests
+sudo python3 test_gpu_disconnect_integration.py
+```
+
+**Manual API Testing:**
+```bash
+# 1. Create GPU workload
+curl -X POST http://localhost:1312/api/gpu/workload/create \
+  -H "Content-Type: application/json" \
+  -d '{"gpu_id": 0, "workload_type": "compute_intensive", "duration": 30.0}'
+
+# 2. Start workload (use workload_id from response)
+curl -X POST http://localhost:1312/api/gpu/workload/{workload_id}/start
+
+# 3. Trigger disconnect while workload is running
+curl -X POST http://localhost:1312/api/gpu/0/disconnect \
+  -H "Content-Type": application/json" \
+  -d '{"method": "auto", "down_time": 5.0}'
+
+# 4. Check workload status (should be "interrupted" or "failed")
+curl http://localhost:1312/api/gpu/workload/{workload_id}/status
+```
+
+See [`tests/README.md`](tests/README.md) for detailed testing documentation.
+
 ---
 
 ## Configuration
@@ -166,6 +197,14 @@ GET  /api/hub/gpu/{node}/{gpu_id}/disconnect/methods  # Get methods for node GPU
 POST /api/hub/gpu/{node}/{gpu_id}/disconnect   # Disconnect GPU on specific node
 POST /api/hub/gpu/disconnect-multiple          # Multi-node batch disconnect
 GET  /api/hub/gpu/disconnect/status             # Hub-wide disconnect status
+
+# GPU Workload Testing API
+POST   /api/gpu/workload/create                # Create new GPU workload
+POST   /api/gpu/workload/{id}/start            # Start workload
+POST   /api/gpu/workload/{id}/stop             # Stop workload
+GET    /api/gpu/workload/{id}/status           # Get workload status
+GET    /api/gpu/workloads                      # List all workloads
+DELETE /api/gpu/workloads/cleanup              # Clean up completed workloads
 ```
 
 ### WebSocket
diff --git a/core/gpu_test_workloads.py b/core/gpu_test_workloads.py
new file mode 100644
index 0000000..f97f3d4
--- /dev/null
+++ b/core/gpu_test_workloads.py
@@ -0,0 +1,432 @@
+#!/usr/bin/env python3
+"""
+GPU Test Workloads - Generate various GPU operations for disconnect testing
+Uses PyTorch/CuPy for CUDA operations without requiring custom CUDA code
+"""
+
+import asyncio
+import logging
+import time
+import threading
+from datetime import datetime
+from typing import Optional, Dict, List
+from enum import Enum
+
+logger = logging.getLogger(__name__)
+
+# Try to import GPU libraries
+try:
+    import torch
+    TORCH_AVAILABLE = torch.cuda.is_available()
+except ImportError:
+    TORCH_AVAILABLE = False
+    logger.warning("PyTorch not available - GPU workload tests will be limited")
+
+try:
+    import cupy as cp
+    CUPY_AVAILABLE = True
+except ImportError:
+    CUPY_AVAILABLE = False
+    logger.warning("CuPy not available - using PyTorch for workloads")
+
+
+class WorkloadType(Enum):
+    """Types of GPU workloads for testing"""
+    MEMORY_STRESS = "memory_stress"
+    COMPUTE_INTENSIVE = "compute_intensive"
+    LONG_RUNNING = "long_running"
+    CONTINUOUS = "continuous"
+    MIXED = "mixed"
+
+
+class WorkloadStatus(Enum):
+    """Status of a running workload"""
+    PENDING = "pending"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    FAILED = "failed"
+    INTERRUPTED = "interrupted"
+
+
+class GPUWorkload:
+    """Represents a single GPU workload operation"""
+    
+    def __init__(self, workload_id: str, gpu_id: int, workload_type: WorkloadType, duration: float = 10.0):
+        self.workload_id = workload_id
+        self.gpu_id = gpu_id
+        self.workload_type = workload_type
+        self.duration = duration
+        self.status = WorkloadStatus.PENDING
+        self.start_time = None
+        self.end_time = None
+        self.error = None
+        self.progress = 0.0
+        self.iterations_completed = 0
+        self.expected_iterations = 100
+        self._stop_event = threading.Event()
+        self._thread = None
+    
+    def start(self):
+        """Start the workload in a background thread"""
+        if self.status != WorkloadStatus.PENDING:
+            raise RuntimeError(f"Workload {self.workload_id} already started")
+        
+        self.status = WorkloadStatus.RUNNING
+        self.start_time = datetime.now()
+        
+        # Run in separate thread to avoid blocking
+        self._thread = threading.Thread(target=self._run_workload, daemon=True)
+        self._thread.start()
+    
+    def stop(self):
+        """Stop the workload gracefully"""
+        self._stop_event.set()
+        if self._thread:
+            self._thread.join(timeout=5.0)
+    
+    def _run_workload(self):
+        """Execute the actual GPU workload"""
+        try:
+            if self.workload_type == WorkloadType.MEMORY_STRESS:
+                self._memory_stress()
+            elif self.workload_type == WorkloadType.COMPUTE_INTENSIVE:
+                self._compute_intensive()
+            elif self.workload_type == WorkloadType.LONG_RUNNING:
+                self._long_running()
+            elif self.workload_type == WorkloadType.CONTINUOUS:
+                self._continuous()
+            elif self.workload_type == WorkloadType.MIXED:
+                self._mixed()
+            else:
+                raise ValueError(f"Unknown workload type: {self.workload_type}")
+            
+            if not self._stop_event.is_set():
+                self.status = WorkloadStatus.COMPLETED
+                self.end_time = datetime.now()
+                logger.info(f"Workload {self.workload_id} completed successfully")
+            else:
+                self.status = WorkloadStatus.INTERRUPTED
+                self.end_time = datetime.now()
+                logger.info(f"Workload {self.workload_id} interrupted")
+                
+        except Exception as e:
+            self.status = WorkloadStatus.FAILED
+            self.end_time = datetime.now()
+            self.error = str(e)
+            logger.error(f"Workload {self.workload_id} failed: {e}")
+    
+    def _memory_stress(self):
+        """Allocate and deallocate GPU memory repeatedly"""
+        if TORCH_AVAILABLE:
+            logger.info(f"Starting memory stress test on GPU {self.gpu_id}")
+            device = torch.device(f'cuda:{self.gpu_id}')
+            
+            iteration = 0
+            start = time.time()
+            
+            while not self._stop_event.is_set() and (time.time() - start) < self.duration:
+                try:
+                    # Allocate large tensors
+                    tensors = []
+                    for _ in range(10):
+                        if self._stop_event.is_set():
+                            break
+                        # Allocate ~100MB per tensor
+                        tensor = torch.randn(1024, 1024, 25, device=device)
+                        tensors.append(tensor)
+                    
+                    # Do some operations
+                    if tensors and not self._stop_event.is_set():
+                        result = torch.stack(tensors).sum()
+                        _ = result.cpu()  # Force computation
+                    
+                    # Deallocate
+                    del tensors
+                    torch.cuda.empty_cache()
+                    
+                    iteration += 1
+                    self.iterations_completed = iteration
+                    self.progress = min(100.0, (time.time() - start) / self.duration * 100)
+                    
+                    time.sleep(0.1)  # Brief pause between iterations
+                    
+                except RuntimeError as e:
+                    if "CUDA" in str(e) or "out of memory" in str(e):
+                        raise  # GPU-related errors should propagate
+                    logger.warning(f"Non-critical error in memory stress: {e}")
+        else:
+            # Fallback without GPU
+            logger.warning("PyTorch CUDA not available, simulating memory stress")
+            time.sleep(self.duration)
+            self.iterations_completed = 100
+    
+    def _compute_intensive(self):
+        """Perform compute-intensive matrix operations"""
+        if TORCH_AVAILABLE:
+            logger.info(f"Starting compute-intensive test on GPU {self.gpu_id}")
+            device = torch.device(f'cuda:{self.gpu_id}')
+            
+            iteration = 0
+            start = time.time()
+            
+            # Create large matrices
+            size = 2048
+            matrix_a = torch.randn(size, size, device=device)
+            matrix_b = torch.randn(size, size, device=device)
+            
+            while not self._stop_event.is_set() and (time.time() - start) < self.duration:
+                try:
+                    # Matrix multiplication (compute-heavy)
+                    result = torch.matmul(matrix_a, matrix_b)
+                    
+                    # Additional operations
+                    result = torch.nn.functional.relu(result)
+                    result = torch.nn.functional.softmax(result, dim=1)
+                    
+                    # Force synchronization
+                    torch.cuda.synchronize(device)
+                    
+                    iteration += 1
+                    self.iterations_completed = iteration
+                    self.progress = min(100.0, (time.time() - start) / self.duration * 100)
+                    
+                except RuntimeError as e:
+                    if "CUDA" in str(e):
+                        raise
+                    logger.warning(f"Non-critical error in compute test: {e}")
+            
+            del matrix_a, matrix_b
+            torch.cuda.empty_cache()
+        else:
+            logger.warning("PyTorch CUDA not available, simulating compute workload")
+            time.sleep(self.duration)
+            self.iterations_completed = 100
+    
+    def _long_running(self):
+        """Single long-running operation"""
+        if TORCH_AVAILABLE:
+            logger.info(f"Starting long-running test on GPU {self.gpu_id}")
+            device = torch.device(f'cuda:{self.gpu_id}')
+            
+            try:
+                # Create very large operation that takes time
+                size = 4096
+                matrix = torch.randn(size, size, device=device)
+                
+                start = time.time()
+                iterations = int(self.duration * 10)  # Adjust based on duration
+                
+                for i in range(iterations):
+                    if self._stop_event.is_set():
+                        break
+                    
+                    # Chain of operations
+                    result = torch.matmul(matrix, matrix)
+                    result = result + matrix
+                    result = torch.nn.functional.relu(result)
+                    matrix = result / result.max()
+                    
+                    torch.cuda.synchronize(device)
+                    
+                    self.iterations_completed = i + 1
+                    self.expected_iterations = iterations
+                    self.progress = min(100.0, (i + 1) / iterations * 100)
+                
+                del matrix, result
+                torch.cuda.empty_cache()
+                
+            except RuntimeError as e:
+                if "CUDA" in str(e):
+                    raise
+                logger.warning(f"Error in long-running test: {e}")
+        else:
+            logger.warning("PyTorch CUDA not available, simulating long-running workload")
+            time.sleep(self.duration)
+            self.iterations_completed = 100
+    
+    def _continuous(self):
+        """Continuous background operations"""
+        if TORCH_AVAILABLE:
+            logger.info(f"Starting continuous test on GPU {self.gpu_id}")
+            device = torch.device(f'cuda:{self.gpu_id}')
+            
+            iteration = 0
+            start = time.time()
+            
+            while not self._stop_event.is_set() and (time.time() - start) < self.duration:
+                try:
+                    # Rapid small operations
+                    tensor = torch.randn(512, 512, device=device)
+                    result = tensor @ tensor.T
+                    _ = result.sum().item()
+                    
+                    iteration += 1
+                    self.iterations_completed = iteration
+                    self.progress = min(100.0, (time.time() - start) / self.duration * 100)
+                    
+                except RuntimeError as e:
+                    if "CUDA" in str(e):
+                        raise
+                    time.sleep(0.01)
+            
+            torch.cuda.empty_cache()
+        else:
+            logger.warning("PyTorch CUDA not available, simulating continuous workload")
+            time.sleep(self.duration)
+            self.iterations_completed = 100
+    
+    def _mixed(self):
+        """Mixed workload combining memory and compute"""
+        if TORCH_AVAILABLE:
+            logger.info(f"Starting mixed test on GPU {self.gpu_id}")
+            device = torch.device(f'cuda:{self.gpu_id}')
+            
+            iteration = 0
+            start = time.time()
+            
+            while not self._stop_event.is_set() and (time.time() - start) < self.duration:
+                try:
+                    # Alternate between memory and compute
+                    if iteration % 2 == 0:
+                        # Memory operations
+                        tensors = [torch.randn(1024, 1024, device=device) for _ in range(5)]
+                        _ = torch.stack(tensors).sum()
+                        del tensors
+                    else:
+                        # Compute operations
+                        a = torch.randn(1024, 1024, device=device)
+                        b = torch.randn(1024, 1024, device=device)
+                        c = torch.matmul(a, b)
+                        _ = c.sum()
+                        del a, b, c
+                    
+                    torch.cuda.synchronize(device)
+                    torch.cuda.empty_cache()
+                    
+                    iteration += 1
+                    self.iterations_completed = iteration
+                    self.progress = min(100.0, (time.time() - start) / self.duration * 100)
+                    
+                    time.sleep(0.1)
+                    
+                except RuntimeError as e:
+                    if "CUDA" in str(e):
+                        raise
+                    logger.warning(f"Error in mixed workload: {e}")
+            
+        else:
+            logger.warning("PyTorch CUDA not available, simulating mixed workload")
+            time.sleep(self.duration)
+            self.iterations_completed = 100
+    
+    def get_status(self) -> Dict:
+        """Get current workload status"""
+        duration = None
+        if self.start_time:
+            end = self.end_time or datetime.now()
+            duration = (end - self.start_time).total_seconds()
+        
+        return {
+            'workload_id': self.workload_id,
+            'gpu_id': self.gpu_id,
+            'type': self.workload_type.value,
+            'status': self.status.value,
+            'progress': self.progress,
+            'iterations_completed': self.iterations_completed,
+            'expected_iterations': self.expected_iterations,
+            'duration_seconds': duration,
+            'error': self.error,
+            'start_time': self.start_time.isoformat() if self.start_time else None,
+            'end_time': self.end_time.isoformat() if self.end_time else None
+        }
+
+
+class GPUWorkloadManager:
+    """Manages multiple GPU workloads"""
+    
+    def __init__(self):
+        self.workloads: Dict[str, GPUWorkload] = {}
+        self.workload_counter = 0
+    
+    def create_workload(
+        self, 
+        gpu_id: int, 
+        workload_type: WorkloadType = WorkloadType.COMPUTE_INTENSIVE,
+        duration: float = 10.0
+    ) -> str:
+        """Create a new workload"""
+        self.workload_counter += 1
+        workload_id = f"workload_{self.workload_counter}_{int(time.time())}"
+        
+        workload = GPUWorkload(workload_id, gpu_id, workload_type, duration)
+        self.workloads[workload_id] = workload
+        
+        logger.info(f"Created workload {workload_id} for GPU {gpu_id}: {workload_type.value}")
+        return workload_id
+    
+    def start_workload(self, workload_id: str):
+        """Start a pending workload"""
+        if workload_id not in self.workloads:
+            raise ValueError(f"Workload {workload_id} not found")
+        
+        workload = self.workloads[workload_id]
+        workload.start()
+        logger.info(f"Started workload {workload_id}")
+    
+    def stop_workload(self, workload_id: str):
+        """Stop a running workload"""
+        if workload_id not in self.workloads:
+            raise ValueError(f"Workload {workload_id} not found")
+        
+        workload = self.workloads[workload_id]
+        workload.stop()
+        logger.info(f"Stopped workload {workload_id}")
+    
+    def get_workload_status(self, workload_id: str) -> Dict:
+        """Get status of a specific workload"""
+        if workload_id not in self.workloads:
+            raise ValueError(f"Workload {workload_id} not found")
+        
+        return self.workloads[workload_id].get_status()
+    
+    def get_all_workloads(self) -> List[Dict]:
+        """Get status of all workloads"""
+        return [w.get_status() for w in self.workloads.values()]
+    
+    def get_active_workloads(self) -> List[Dict]:
+        """Get status of currently running workloads"""
+        return [
+            w.get_status() 
+            for w in self.workloads.values() 
+            if w.status == WorkloadStatus.RUNNING
+        ]
+    
+    def cleanup_completed(self):
+        """Remove completed/failed workloads older than 5 minutes"""
+        cutoff = time.time() - 300  # 5 minutes ago
+        to_remove = []
+        
+        for wid, workload in self.workloads.items():
+            if workload.status in [WorkloadStatus.COMPLETED, WorkloadStatus.FAILED, WorkloadStatus.INTERRUPTED]:
+                if workload.end_time:
+                    end_timestamp = workload.end_time.timestamp()
+                    if end_timestamp < cutoff:
+                        to_remove.append(wid)
+        
+        for wid in to_remove:
+            del self.workloads[wid]
+        
+        if to_remove:
+            logger.info(f"Cleaned up {len(to_remove)} old workloads")
+    
+    def stop_all(self):
+        """Stop all running workloads"""
+        for workload in self.workloads.values():
+            if workload.status == WorkloadStatus.RUNNING:
+                workload.stop()
+        
+        logger.info("Stopped all workloads")
+
+
+# Global workload manager instance
+workload_manager = GPUWorkloadManager()
diff --git a/core/handlers.py b/core/handlers.py
index 18f4d50..8d481df 100644
--- a/core/handlers.py
+++ b/core/handlers.py
@@ -10,6 +10,7 @@
 from pydantic import BaseModel
 from . import config
 from .gpu_disconnect import disconnect_gpu, disconnect_multiple_gpus, get_available_methods, GPUDisconnectError
+from .gpu_test_workloads import workload_manager, WorkloadType
 
 logger = logging.getLogger(__name__)
 
@@ -29,6 +30,12 @@ class MultiDisconnectRequest(BaseModel):
     down_time: float = 5.0
 
 
+class WorkloadRequest(BaseModel):
+    gpu_id: int
+    workload_type: str = "compute_intensive"
+    duration: float = 10.0
+
+
 def register_handlers(app, monitor):
     """Register FastAPI WebSocket handlers"""
     
@@ -141,6 +148,99 @@ async def get_disconnect_status():
         except Exception as e:
             logger.error(f"Error checking disconnect status: {e}")
             raise HTTPException(status_code=500, detail=str(e))
+    
+    # GPU Workload Testing API Endpoints
+    @app.post("/api/gpu/workload/create")
+    async def create_workload(request: WorkloadRequest):
+        """Create a new GPU workload for testing"""
+        try:
+            workload_id = workload_manager.create_workload(
+                gpu_id=request.gpu_id,
+                workload_type=WorkloadType(request.workload_type),
+                duration=request.duration
+            )
+            
+            return {
+                "workload_id": workload_id,
+                "gpu_id": request.gpu_id,
+                "workload_type": request.workload_type,
+                "duration": request.duration,
+                "status": "created"
+            }
+            
+        except Exception as e:
+            logger.error(f"Error creating workload: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.post("/api/gpu/workload/{workload_id}/start")
+    async def start_workload(workload_id: str):
+        """Start a GPU workload"""
+        try:
+            workload_manager.start_workload(workload_id)
+            status = workload_manager.get_workload_status(workload_id)
+            return status
+            
+        except ValueError as e:
+            raise HTTPException(status_code=404, detail=str(e))
+        except Exception as e:
+            logger.error(f"Error starting workload: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.post("/api/gpu/workload/{workload_id}/stop")
+    async def stop_workload(workload_id: str):
+        """Stop a running GPU workload"""
+        try:
+            workload_manager.stop_workload(workload_id)
+            status = workload_manager.get_workload_status(workload_id)
+            return status
+            
+        except ValueError as e:
+            raise HTTPException(status_code=404, detail=str(e))
+        except Exception as e:
+            logger.error(f"Error stopping workload: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.get("/api/gpu/workload/{workload_id}/status")
+    async def get_workload_status_api(workload_id: str):
+        """Get status of a specific workload"""
+        try:
+            status = workload_manager.get_workload_status(workload_id)
+            return status
+            
+        except ValueError as e:
+            raise HTTPException(status_code=404, detail=str(e))
+        except Exception as e:
+            logger.error(f"Error getting workload status: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.get("/api/gpu/workloads")
+    async def get_all_workloads():
+        """Get status of all workloads"""
+        try:
+            workloads = workload_manager.get_all_workloads()
+            active = workload_manager.get_active_workloads()
+            
+            return {
+                "total_workloads": len(workloads),
+                "active_workloads": len(active),
+                "workloads": workloads,
+                "active": active
+            }
+            
+        except Exception as e:
+            logger.error(f"Error getting workloads: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+    
+    @app.delete("/api/gpu/workloads/cleanup")
+    async def cleanup_workloads():
+        """Clean up completed workloads"""
+        try:
+            workload_manager.cleanup_completed()
+            return {"status": "ok", "message": "Cleaned up completed workloads"}
+            
+        except Exception as e:
+            logger.error(f"Error cleaning up workloads: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
 
 
 async def monitor_loop(monitor, connections):
diff --git a/requirements.txt b/requirements.txt
index a7b7cc4..6699dc2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,5 @@ psutil==5.9.6
 nvidia-ml-py==13.580.82
 requests==2.31.0
 websocket-client==1.6.3
-aiohttp==3.9.1
\ No newline at end of file
+aiohttp==3.9.1
+torch==2.1.0
\ No newline at end of file
diff --git a/test_quick_validation.py b/test_quick_validation.py
new file mode 100644
index 0000000..d76cdf6
--- /dev/null
+++ b/test_quick_validation.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+"""
+Quick validation script - Test GPU disconnect functionality
+Run this to verify the implementation works on your system
+"""
+
+import sys
+import time
+import asyncio
+
+print("="*80)
+print("GPU DISCONNECT FUNCTIONALITY - QUICK VALIDATION TEST")
+print("="*80)
+print()
+
+# Check 1: Verify all modules can be imported
+print("✓ Step 1: Checking module imports...")
+try:
+    from core.gpu_disconnect import gpu_disconnector, DisconnectMethod
+    print("  ✓ GPU disconnect module loaded")
+except ImportError as e:
+    print(f"  ✗ Failed to import gpu_disconnect: {e}")
+    sys.exit(1)
+
+try:
+    from core.gpu_test_workloads import workload_manager, WorkloadType, TORCH_AVAILABLE
+    print("  ✓ GPU workload module loaded")
+except ImportError as e:
+    print(f"  ✗ Failed to import gpu_test_workloads: {e}")
+    sys.exit(1)
+
+try:
+    from tests.test_gpu_disconnect_integration import (
+        create_basic_disconnect_test,
+        create_standard_test_suite
+    )
+    print("  ✓ Integration test module loaded")
+except ImportError as e:
+    print(f"  ✗ Failed to import integration tests: {e}")
+    sys.exit(1)
+
+print()
+
+# Check 2: Verify PyTorch availability
+print("✓ Step 2: Checking GPU libraries...")
+if TORCH_AVAILABLE:
+    import torch
+    gpu_count = torch.cuda.device_count()
+    print(f"  ✓ PyTorch CUDA available: {gpu_count} GPU(s) detected")
+    if gpu_count > 0:
+        for i in range(gpu_count):
+            name = torch.cuda.get_device_name(i)
+            print(f"    - GPU {i}: {name}")
+else:
+    print("  ⚠ PyTorch CUDA not available")
+    print("    Install with: pip install torch --index-url https://download.pytorch.org/whl/cu118")
+    print("    Continuing with limited functionality...")
+
+print()
+
+# Check 3: Test workload creation
+print("✓ Step 3: Testing workload creation...")
+try:
+    workload_id = workload_manager.create_workload(
+        gpu_id=0,
+        workload_type=WorkloadType.COMPUTE_INTENSIVE,
+        duration=5.0
+    )
+    print(f"  ✓ Created test workload: {workload_id}")
+    
+    # Get status
+    status = workload_manager.get_workload_status(workload_id)
+    print(f"  ✓ Workload status: {status['status']}")
+    
+except Exception as e:
+    print(f"  ✗ Failed to create workload: {e}")
+    sys.exit(1)
+
+print()
+
+# Check 4: Test disconnect capability detection
+print("✓ Step 4: Checking disconnect capabilities...")
+async def check_disconnect():
+    try:
+        methods = await gpu_disconnector.get_available_methods(0)
+        print(f"  ✓ Available disconnect methods: {', '.join(methods)}")
+        return True
+    except Exception as e:
+        print(f"  ⚠ Could not detect methods: {e}")
+        print("    This is expected if not running as root")
+        return False
+
+has_disconnect = asyncio.run(check_disconnect())
+
+print()
+
+# Check 5: Run a simple test (if PyTorch available)
+if TORCH_AVAILABLE and gpu_count > 0:
+    print("✓ Step 5: Running quick GPU workload test...")
+    try:
+        # Start the workload
+        workload_manager.start_workload(workload_id)
+        print(f"  ✓ Started workload on GPU 0")
+        
+        # Monitor for a few seconds
+        for i in range(3):
+            time.sleep(1)
+            status = workload_manager.get_workload_status(workload_id)
+            print(f"  ✓ Progress: {status['progress']:.1f}% "
+                  f"({status['iterations_completed']} iterations, "
+                  f"status: {status['status']})")
+        
+        # Stop it
+        workload_manager.stop_workload(workload_id)
+        final_status = workload_manager.get_workload_status(workload_id)
+        print(f"  ✓ Workload stopped: {final_status['status']}")
+        
+    except Exception as e:
+        print(f"  ✗ Workload test failed: {e}")
+        import traceback
+        traceback.print_exc()
+else:
+    print("⊘ Step 5: Skipping workload test (PyTorch/CUDA not available)")
+
+print()
+
+# Check 6: Test integration test creation
+print("✓ Step 6: Testing integration test framework...")
+try:
+    test = create_basic_disconnect_test(gpu_id=0)
+    print(f"  ✓ Created test: {test.name}")
+    print(f"    Description: {test.description}")
+    print(f"    Workload: {test.workload_type.value}")
+    print(f"    Duration: {test.workload_duration}s")
+except Exception as e:
+    print(f"  ✗ Failed to create integration test: {e}")
+    sys.exit(1)
+
+print()
+
+# Summary
+print("="*80)
+print("VALIDATION SUMMARY")
+print("="*80)
+print()
+
+all_checks = [
+    ("Module imports", True),
+    ("PyTorch CUDA", TORCH_AVAILABLE),
+    ("Workload creation", True),
+    ("Disconnect detection", has_disconnect),
+    ("GPU workload execution", TORCH_AVAILABLE and gpu_count > 0),
+    ("Integration test framework", True)
+]
+
+passed = sum(1 for _, result in all_checks if result)
+total = len(all_checks)
+
+for check_name, result in all_checks:
+    symbol = "✓" if result else "⚠" if "PyTorch" in check_name else "✗"
+    status = "PASS" if result else "WARN" if "PyTorch" in check_name else "FAIL"
+    print(f"{symbol} {check_name}: {status}")
+
+print()
+print(f"Results: {passed}/{total} checks passed")
+print()
+
+if not TORCH_AVAILABLE:
+    print("⚠ WARNING: PyTorch CUDA not available")
+    print("  The framework is installed but cannot run GPU workloads")
+    print("  Install PyTorch with CUDA:")
+    print("  pip install torch --index-url https://download.pytorch.org/whl/cu118")
+    print()
+
+if not has_disconnect:
+    print("⚠ WARNING: Disconnect capabilities limited")
+    print("  This is normal if not running as root or in WSL2")
+    print("  For full disconnect testing, run with sudo on bare-metal Linux")
+    print()
+
+# Next steps
+print("="*80)
+print("NEXT STEPS")
+print("="*80)
+print()
+print("1. Start the application:")
+print("   docker-compose up --build")
+print()
+print("2. Test via Web UI:")
+print("   Open http://localhost:1312")
+print("   - Click disconnect button on any GPU")
+print("   - Select method and duration")
+print()
+print("3. Run full integration tests:")
+print("   cd tests")
+print("   sudo python3 test_gpu_disconnect_integration.py")
+print()
+print("4. Test via API:")
+print("   # Create workload")
+print("   curl -X POST http://localhost:1312/api/gpu/workload/create \\")
+print("     -H 'Content-Type: application/json' \\")
+print("     -d '{\"gpu_id\": 0, \"workload_type\": \"compute_intensive\", \"duration\": 30}'")
+print()
+print("   # Start workload (use workload_id from response)")
+print("   curl -X POST http://localhost:1312/api/gpu/workload/<ID>/start")
+print()
+print("   # Trigger disconnect while running")
+print("   curl -X POST http://localhost:1312/api/gpu/0/disconnect \\")
+print("     -H 'Content-Type: application/json' \\")
+print("     -d '{\"method\": \"auto\", \"down_time\": 5}'")
+print()
+print("   # Check workload status (should be interrupted)")
+print("   curl http://localhost:1312/api/gpu/workload/<ID>/status")
+print()
+print("="*80)
+print()
+
+if passed == total:
+    print("✓ ALL SYSTEMS GO! The implementation is ready to use.")
+    sys.exit(0)
+elif passed >= total - 1:
+    print("⚠ MOSTLY READY - Some optional features unavailable")
+    sys.exit(0)
+else:
+    print("✗ ISSUES DETECTED - Please review warnings above")
+    sys.exit(1)
+
diff --git a/tests/README.md b/tests/README.md
index 9bdb0c0..2dfa08d 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -1,73 +1,243 @@
-# GPU Hot - Load Testing (FastAPI + AsyncIO)
+# GPU Disconnect Integration Tests
 
-Simple load testing for multi-node GPU monitoring with realistic async patterns.
+This directory contains comprehensive integration tests for GPU disconnect functionality.
 
 ## Quick Start
 
+### Run Full Test Suite
 ```bash
 cd tests
-docker-compose -f docker-compose.test.yml up
+python3 test_gpu_disconnect_integration.py
 ```
 
-Open http://localhost:1312 to see the dashboard.
+This will run a complete suite of disconnect tests including:
+- Basic disconnect during compute workload
+- Memory stress test with disconnect
+- Immediate disconnect after workload start  
+- Continuous workload disconnect
 
-## Architecture
+## Requirements
 
-- **FastAPI + AsyncIO**: Modern async Python for better performance
-- **Native WebSockets**: No Socket.IO overhead, direct WebSocket protocol
-- **Concurrent Mock Nodes**: Multiple nodes running in parallel
-- **Realistic GPU Patterns**: Training jobs with epochs, warmup, validation
+### System Requirements
+- **Linux** with PCI sysfs (`/sys/bus/pci/devices`)
+- **Root privileges** (for actual GPU disconnect)
+- **NVIDIA GPU** with drivers installed
+- **PyTorch with CUDA** support
 
-## Load Test Presets
+### Python Dependencies
+```bash
+pip install torch --index-url https://download.pytorch.org/whl/cu118
+```
+
+Or use the Docker container which includes all dependencies.
+
+## Test Components
+
+### 1. GPU Workload Generator (`core/gpu_test_workloads.py`)
+Generates various GPU workloads for testing:
+
+**Workload Types:**
+- `MEMORY_STRESS` - Rapid memory allocation/deallocation
+- `COMPUTE_INTENSIVE` - Matrix multiplications and heavy compute
+- `LONG_RUNNING` - Single long operation with many iterations
+- `CONTINUOUS` - Rapid small operations in tight loop
+- `MIXED` - Combination of memory and compute operations
+
+### 2. Integration Test Framework (`test_gpu_disconnect_integration.py`)
+Orchestrates complete test scenarios:
+
+**Test Phases:**
+1. **Start Workload** - Begin GPU operation
+2. **Monitor** - Track workload progress
+3. **Disconnect** - Trigger GPU disconnect
+4. **Validate** - Verify expected behavior
+
+**Expected Results:**
+- Workload interrupted or fails during disconnect
+- CUDA errors captured appropriately
+- GPU unavailable during disconnect period
+- GPU recovers after reconnect
+
+### 3. Pre-configured Test Scenarios
+Ready-to-use test configurations:
+
+```python
+from tests.test_gpu_disconnect_integration import (
+    create_basic_disconnect_test,
+    create_memory_stress_disconnect_test,
+    create_immediate_disconnect_test,
+    create_continuous_workload_test,
+    create_standard_test_suite
+)
+
+# Run single test
+test = create_basic_disconnect_test(gpu_id=0)
+result = await test.run()
+
+# Run full suite
+suite = create_standard_test_suite(gpu_id=0)
+results = await suite.run_all()
+```
+
+## Manual Testing with API
 
-Edit `docker-compose.test.yml` and uncomment the preset you want:
+You can also test via the REST API when the application is running:
 
-### LIGHT (3 nodes, 14 GPUs)
-Good for development and quick testing.
-```yaml
-- NODES=2,4,8
-- NODE_URLS=http://mock-cluster:13120,http://mock-cluster:13121,http://mock-cluster:13122
+### 1. Create and Start Workload
+```bash
+# Create workload
+curl -X POST http://localhost:1312/api/gpu/workload/create \
+  -H "Content-Type: application/json" \
+  -d '{"gpu_id": 0, "workload_type": "compute_intensive", "duration": 30.0}'
+
+# Response includes workload_id
+# {"workload_id": "workload_1_1234567890", ...}
+
+# Start the workload
+curl -X POST http://localhost:1312/api/gpu/workload/workload_1_1234567890/start
 ```
 
-### MEDIUM (8 nodes, 64 GPUs) ⭐ Default
-Realistic medium-sized cluster.
-```yaml
-- NODES=8,8,8,8,8,8,8,8
-- NODE_URLS=http://mock-cluster:13120,...,http://mock-cluster:13127
+### 2. Monitor Workload
+```bash
+# Check workload status
+curl http://localhost:1312/api/gpu/workload/workload_1_1234567890/status
+
+# List all workloads
+curl http://localhost:1312/api/gpu/workloads
 ```
 
-### HEAVY (20 nodes, 160 GPUs)
-Stress test for large production environments.
-```yaml
-- NODES=8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
-- NODE_URLS=http://mock-cluster:13120,...,http://mock-cluster:13139
+### 3. Trigger Disconnect During Workload
+```bash
+# While workload is running, trigger disconnect
+curl -X POST http://localhost:1312/api/gpu/0/disconnect \
+  -H "Content-Type: application/json" \
+  -d '{"method": "auto", "down_time": 5.0}'
 ```
 
-## What's Simulated
+### 4. Check Results
+```bash
+# Check final workload status
+curl http://localhost:1312/api/gpu/workload/workload_1_1234567890/status
 
-- **Realistic GPU patterns**: Training jobs with epochs, warmup, validation
-- **Idle + busy GPUs**: ~40% utilization typical of real clusters
-- **Stable memory**: Memory allocated at job start, stays constant
-- **Clock speeds**: Proper P-states (P0/P2/P8)
-- **Data loading dips**: Periodic utilization drops
-- **Temperature correlation**: Realistic thermal behavior
+# Expected: status should be "interrupted" or "failed"
+```
 
-## Files
+## Test Validation Criteria
 
-- `test_cluster.py` - Mock GPU node with realistic patterns (FastAPI + AsyncIO)
-- `docker-compose.test.yml` - Test stack with preset configurations
-- `Dockerfile.test` - Container for mock nodes (FastAPI dependencies)
+### Successful Disconnect Test:
+✅ Workload starts successfully  
+✅ Disconnect operation completes  
+✅ Workload is interrupted/fails during disconnect  
+✅ GPU becomes unavailable (nvidia-smi shows error)  
+✅ GPU recovers after reconnect  
+✅ New operations can be scheduled after recovery  
 
-## Performance Benefits
+### Expected Behaviors:
 
-- **20-40% latency reduction** with true async/await
-- **2-3x more concurrent connections** supported
-- **Better resource utilization** for hub mode aggregation
-- **Sub-500ms latency** consistently achieved
+**During Disconnect:**
+- Running CUDA operations fail with errors
+- New operations cannot be scheduled
+- `nvidia-smi` reports GPU unavailable
+- Workload status changes to `interrupted` or `failed`
 
-## Rebuild After Changes
+**After Reconnect:**
+- GPU reappears in system
+- New workloads can be created
+- Operations complete successfully
+- No memory leaks or resource issues
 
+## Troubleshooting
+
+### "PyTorch CUDA not available"
+Install PyTorch with CUDA support:
 ```bash
-docker-compose -f docker-compose.test.yml down
-docker-compose -f docker-compose.test.yml up --build
+pip install torch --index-url https://download.pytorch.org/whl/cu118
 ```
+
+### "Permission denied" during disconnect
+Tests require root privileges for actual GPU disconnect:
+```bash
+sudo python3 test_gpu_disconnect_integration.py
+```
+
+### "Workload completed despite disconnect"
+This indicates the disconnect didn't actually affect the GPU. Possible causes:
+- Insufficient privileges (need root)
+- WSL2 limitations (use bare metal Linux)
+- Disconnect method not supported on platform
+
+### Tests pass but you want to verify manually
+Check system logs during test:
+```bash
+# Terminal 1: Run tests
+sudo python3 test_gpu_disconnect_integration.py
+
+# Terminal 2: Watch GPU status
+watch -n 0.5 nvidia-smi
+
+# Terminal 3: Monitor kernel messages
+sudo dmesg -w | grep -i gpu
+```
+
+## Advanced Usage
+
+### Custom Test Scenario
+```python
+from tests.test_gpu_disconnect_integration import DisconnectTestScenario
+from core.gpu_test_workloads import WorkloadType
+
+# Create custom test
+test = DisconnectTestScenario(
+    test_id="custom_test_1",
+    name="Custom Stress Test",
+    description="My custom disconnect scenario",
+    gpu_id=0,
+    workload_type=WorkloadType.MEMORY_STRESS,
+    workload_duration=60.0,      # 60 second workload
+    disconnect_delay=10.0,        # Disconnect after 10s
+    disconnect_method="logical",  # Force logical method
+    disconnect_duration=15.0      # Keep disconnected for 15s
+)
+
+result = await test.run()
+print(result)
+```
+
+### Multi-GPU Testing
+```python
+# Test on different GPUs
+suite = DisconnectTestSuite("Multi-GPU Tests")
+
+for gpu_id in [0, 1, 2, 3]:
+    test = create_basic_disconnect_test(gpu_id=gpu_id)
+    suite.add_test(test)
+
+results = await suite.run_all()
+```
+
+## CI/CD Integration
+
+For automated testing in CI/CD pipelines:
+
+```bash
+# Run tests with JSON output
+python3 test_gpu_disconnect_integration.py --json > results.json
+
+# Check exit code
+if [ $? -eq 0 ]; then
+    echo "All tests passed"
+else
+    echo "Tests failed"
+    exit 1
+fi
+```
+
+## WSL2 / Limited Environments
+
+In WSL2 or environments without full PCI access, tests will:
+- Execute workloads successfully ✅
+- Attempt disconnect operations ✅
+- Report permission errors (expected) ⚠️
+- Still validate UI/API functionality ✅
+
+This allows partial validation even without hardware disconnect capability.
\ No newline at end of file
diff --git a/tests/test_gpu_disconnect_integration.py b/tests/test_gpu_disconnect_integration.py
new file mode 100644
index 0000000..1ed1e51
--- /dev/null
+++ b/tests/test_gpu_disconnect_integration.py
@@ -0,0 +1,407 @@
+#!/usr/bin/env python3
+"""
+GPU Disconnect Integration Tests
+Orchestrates workloads, triggers disconnects, and validates results
+"""
+
+import asyncio
+import logging
+import time
+from datetime import datetime
+from typing import Dict, List, Optional
+from enum import Enum
+
+import sys
+sys.path.insert(0, '../')
+
+from core.gpu_test_workloads import (
+    WorkloadType, WorkloadStatus, workload_manager, TORCH_AVAILABLE
+)
+from core.gpu_disconnect import gpu_disconnector, DisconnectMethod, GPUDisconnectError
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class TestStatus(Enum):
+    """Status of a disconnect test"""
+    PENDING = "pending"
+    RUNNING = "running"
+    PASSED = "passed"
+    FAILED = "failed"
+    ERROR = "error"
+
+
+class DisconnectTestScenario:
+    """Represents a single disconnect test scenario"""
+    
+    def __init__(
+        self,
+        test_id: str,
+        name: str,
+        description: str,
+        gpu_id: int,
+        workload_type: WorkloadType = WorkloadType.COMPUTE_INTENSIVE,
+        workload_duration: float = 15.0,
+        disconnect_delay: float = 3.0,
+        disconnect_method: str = "auto",
+        disconnect_duration: float = 5.0
+    ):
+        self.test_id = test_id
+        self.name = name
+        self.description = description
+        self.gpu_id = gpu_id
+        self.workload_type = workload_type
+        self.workload_duration = workload_duration
+        self.disconnect_delay = disconnect_delay
+        self.disconnect_method = disconnect_method
+        self.disconnect_duration = disconnect_duration
+        
+        self.status = TestStatus.PENDING
+        self.start_time = None
+        self.end_time = None
+        self.workload_id = None
+        self.workload_status_before = None
+        self.workload_status_during = None
+        self.workload_status_after = None
+        self.disconnect_result = None
+        self.errors = []
+        self.logs = []
+    
+    async def run(self) -> Dict:
+        """Execute the test scenario"""
+        self.status = TestStatus.RUNNING
+        self.start_time = datetime.now()
+        self.log(f"Starting test: {self.name}")
+        
+        try:
+            # Phase 1: Start GPU workload
+            self.log(f"Phase 1: Starting {self.workload_type.value} workload on GPU {self.gpu_id}")
+            self.workload_id = workload_manager.create_workload(
+                gpu_id=self.gpu_id,
+                workload_type=self.workload_type,
+                duration=self.workload_duration
+            )
+            workload_manager.start_workload(self.workload_id)
+            
+            # Wait a bit for workload to get going
+            await asyncio.sleep(1.0)
+            self.workload_status_before = workload_manager.get_workload_status(self.workload_id)
+            self.log(f"Workload started: {self.workload_status_before['iterations_completed']} iterations")
+            
+            # Phase 2: Wait before disconnect
+            if self.disconnect_delay > 0:
+                self.log(f"Phase 2: Waiting {self.disconnect_delay}s before disconnect")
+                await asyncio.sleep(self.disconnect_delay)
+                self.workload_status_during = workload_manager.get_workload_status(self.workload_id)
+                self.log(f"Workload progress: {self.workload_status_during['progress']:.1f}% "
+                        f"({self.workload_status_during['iterations_completed']} iterations)")
+            
+            # Phase 3: Trigger disconnect
+            self.log(f"Phase 3: Triggering GPU {self.gpu_id} disconnect using {self.disconnect_method}")
+            disconnect_start = time.time()
+            
+            try:
+                self.disconnect_result = await gpu_disconnector.disconnect_gpu(
+                    gpu_index=self.gpu_id,
+                    method=DisconnectMethod(self.disconnect_method),
+                    down_time=self.disconnect_duration
+                )
+                disconnect_elapsed = time.time() - disconnect_start
+                self.log(f"Disconnect completed in {disconnect_elapsed:.2f}s: {self.disconnect_result.get('message', 'OK')}")
+                
+            except GPUDisconnectError as e:
+                self.log(f"Disconnect operation failed: {e}", level=logging.ERROR)
+                self.errors.append(f"Disconnect failed: {e}")
+                self.disconnect_result = {'success': False, 'error': str(e)}
+            
+            # Phase 4: Check workload status after disconnect
+            await asyncio.sleep(1.0)
+            self.workload_status_after = workload_manager.get_workload_status(self.workload_id)
+            self.log(f"Workload final status: {self.workload_status_after['status']} "
+                    f"({self.workload_status_after['iterations_completed']} iterations)")
+            
+            # Phase 5: Validate results
+            self.log("Phase 5: Validating test results")
+            validation = self.validate_results()
+            
+            if validation['passed']:
+                self.status = TestStatus.PASSED
+                self.log("✓ Test PASSED")
+            else:
+                self.status = TestStatus.FAILED
+                self.log(f"✗ Test FAILED: {validation['reason']}")
+                self.errors.append(validation['reason'])
+            
+        except Exception as e:
+            self.status = TestStatus.ERROR
+            self.log(f"Test ERROR: {e}", level=logging.ERROR)
+            self.errors.append(str(e))
+            
+        finally:
+            self.end_time = datetime.now()
+            # Clean up workload
+            if self.workload_id:
+                try:
+                    workload_manager.stop_workload(self.workload_id)
+                except:
+                    pass
+        
+        return self.get_result()
+    
+    def validate_results(self) -> Dict:
+        """Validate that the test behaved as expected"""
+        # Expected behavior: workload should be interrupted or fail during disconnect
+        
+        if not self.workload_status_after:
+            return {'passed': False, 'reason': 'No workload status available after disconnect'}
+        
+        # Check if disconnect succeeded
+        if not self.disconnect_result or not self.disconnect_result.get('success'):
+            # If disconnect failed, test is inconclusive but not necessarily failed
+            # (might be testing in an environment without proper permissions)
+            return {
+                'passed': True,  # Pass but note the limitation
+                'reason': 'Disconnect operation failed (expected in limited environments)',
+                'note': 'Could not validate actual GPU disconnect behavior'
+            }
+        
+        # If disconnect succeeded, workload should be interrupted or failed
+        workload_final_status = self.workload_status_after['status']
+        
+        # Expected: workload interrupted, failed, or didn't complete all iterations
+        if workload_final_status in ['interrupted', 'failed']:
+            return {
+                'passed': True,
+                'reason': f'Workload correctly {workload_final_status} during disconnect'
+            }
+        
+        # Check if workload completed but didn't finish all expected iterations
+        if workload_final_status == 'completed':
+            completed = self.workload_status_after['iterations_completed']
+            expected = self.workload_status_after.get('expected_iterations', 100)
+            
+            if completed < expected:
+                return {
+                    'passed': True,
+                    'reason': f'Workload interrupted early ({completed}/{expected} iterations)'
+                }
+            else:
+                return {
+                    'passed': False,
+                    'reason': 'Workload completed all iterations despite disconnect (disconnect may not have affected GPU)'
+                }
+        
+        return {
+            'passed': True,
+            'reason': 'Test completed with expected behavior'
+        }
+    
+    def log(self, message: str, level=logging.INFO):
+        """Log a message"""
+        timestamp = datetime.now().isoformat()
+        log_entry = f"[{timestamp}] {message}"
+        self.logs.append(log_entry)
+        logger.log(level, f"[{self.test_id}] {message}")
+    
+    def get_result(self) -> Dict:
+        """Get test results"""
+        duration = None
+        if self.start_time and self.end_time:
+            duration = (self.end_time - self.start_time).total_seconds()
+        
+        return {
+            'test_id': self.test_id,
+            'name': self.name,
+            'description': self.description,
+            'status': self.status.value,
+            'duration_seconds': duration,
+            'gpu_id': self.gpu_id,
+            'workload_type': self.workload_type.value,
+            'disconnect_method': self.disconnect_method,
+            'workload_before': self.workload_status_before,
+            'workload_during': self.workload_status_during,
+            'workload_after': self.workload_status_after,
+            'disconnect_result': self.disconnect_result,
+            'errors': self.errors,
+            'logs': self.logs,
+            'start_time': self.start_time.isoformat() if self.start_time else None,
+            'end_time': self.end_time.isoformat() if self.end_time else None
+        }
+
+
+class DisconnectTestSuite:
+    """Collection of test scenarios"""
+    
+    def __init__(self, suite_name: str):
+        self.suite_name = suite_name
+        self.tests: List[DisconnectTestScenario] = []
+        self.start_time = None
+        self.end_time = None
+    
+    def add_test(self, test: DisconnectTestScenario):
+        """Add a test to the suite"""
+        self.tests.append(test)
+    
+    async def run_all(self) -> Dict:
+        """Run all tests in the suite"""
+        self.start_time = datetime.now()
+        logger.info(f"Starting test suite: {self.suite_name} ({len(self.tests)} tests)")
+        
+        results = []
+        passed = 0
+        failed = 0
+        errors = 0
+        
+        for test in self.tests:
+            logger.info(f"Running test {len(results) + 1}/{len(self.tests)}: {test.name}")
+            result = await test.run()
+            results.append(result)
+            
+            if result['status'] == 'passed':
+                passed += 1
+            elif result['status'] == 'failed':
+                failed += 1
+            elif result['status'] == 'error':
+                errors += 1
+            
+            # Brief pause between tests
+            await asyncio.sleep(2.0)
+        
+        self.end_time = datetime.now()
+        duration = (self.end_time - self.start_time).total_seconds()
+        
+        summary = {
+            'suite_name': self.suite_name,
+            'total_tests': len(self.tests),
+            'passed': passed,
+            'failed': failed,
+            'errors': errors,
+            'duration_seconds': duration,
+            'tests': results,
+            'start_time': self.start_time.isoformat(),
+            'end_time': self.end_time.isoformat()
+        }
+        
+        logger.info(f"Test suite completed: {passed} passed, {failed} failed, {errors} errors")
+        return summary
+
+
+# Pre-configured test scenarios
+
+def create_basic_disconnect_test(gpu_id: int = 0) -> DisconnectTestScenario:
+    """Basic disconnect test - compute workload + disconnect"""
+    return DisconnectTestScenario(
+        test_id=f"basic_disconnect_gpu{gpu_id}_{int(time.time())}",
+        name="Basic Disconnect Test",
+        description="Start compute workload, wait, then disconnect GPU",
+        gpu_id=gpu_id,
+        workload_type=WorkloadType.COMPUTE_INTENSIVE,
+        workload_duration=15.0,
+        disconnect_delay=3.0,
+        disconnect_method="auto",
+        disconnect_duration=5.0
+    )
+
+
+def create_memory_stress_disconnect_test(gpu_id: int = 0) -> DisconnectTestScenario:
+    """Memory stress disconnect test"""
+    return DisconnectTestScenario(
+        test_id=f"memory_disconnect_gpu{gpu_id}_{int(time.time())}",
+        name="Memory Stress Disconnect Test",
+        description="Memory allocation stress test during disconnect",
+        gpu_id=gpu_id,
+        workload_type=WorkloadType.MEMORY_STRESS,
+        workload_duration=20.0,
+        disconnect_delay=4.0,
+        disconnect_method="auto",
+        disconnect_duration=5.0
+    )
+
+
+def create_immediate_disconnect_test(gpu_id: int = 0) -> DisconnectTestScenario:
+    """Immediate disconnect test - disconnect right after workload starts"""
+    return DisconnectTestScenario(
+        test_id=f"immediate_disconnect_gpu{gpu_id}_{int(time.time())}",
+        name="Immediate Disconnect Test",
+        description="Disconnect GPU immediately after workload starts",
+        gpu_id=gpu_id,
+        workload_type=WorkloadType.LONG_RUNNING,
+        workload_duration=30.0,
+        disconnect_delay=1.0,
+        disconnect_method="logical",
+        disconnect_duration=3.0
+    )
+
+
+def create_continuous_workload_test(gpu_id: int = 0) -> DisconnectTestScenario:
+    """Continuous workload disconnect test"""
+    return DisconnectTestScenario(
+        test_id=f"continuous_disconnect_gpu{gpu_id}_{int(time.time())}",
+        name="Continuous Workload Disconnect",
+        description="Continuous rapid operations during disconnect",
+        gpu_id=gpu_id,
+        workload_type=WorkloadType.CONTINUOUS,
+        workload_duration=25.0,
+        disconnect_delay=5.0,
+        disconnect_method="auto",
+        disconnect_duration=7.0
+    )
+
+
+def create_standard_test_suite(gpu_id: int = 0) -> DisconnectTestSuite:
+    """Create standard test suite with common scenarios"""
+    suite = DisconnectTestSuite(f"Standard Disconnect Tests (GPU {gpu_id})")
+    
+    suite.add_test(create_basic_disconnect_test(gpu_id))
+    suite.add_test(create_memory_stress_disconnect_test(gpu_id))
+    suite.add_test(create_immediate_disconnect_test(gpu_id))
+    suite.add_test(create_continuous_workload_test(gpu_id))
+    
+    return suite
+
+
+# Main test execution
+async def main():
+    """Run test suite"""
+    if not TORCH_AVAILABLE:
+        logger.error("PyTorch with CUDA not available - cannot run GPU tests")
+        logger.info("Install PyTorch with CUDA support: pip install torch --index-url https://download.pytorch.org/whl/cu118")
+        return
+    
+    import torch
+    gpu_count = torch.cuda.device_count()
+    logger.info(f"Found {gpu_count} GPU(s) available for testing")
+    
+    if gpu_count == 0:
+        logger.error("No GPUs available for testing")
+        return
+    
+    # Run standard test suite on GPU 0
+    suite = create_standard_test_suite(gpu_id=0)
+    results = await suite.run_all()
+    
+    # Print summary
+    print("\n" + "="*80)
+    print(f"Test Suite: {results['suite_name']}")
+    print("="*80)
+    print(f"Total Tests: {results['total_tests']}")
+    print(f"Passed: {results['passed']}")
+    print(f"Failed: {results['failed']}")
+    print(f"Errors: {results['errors']}")
+    print(f"Duration: {results['duration_seconds']:.2f}s")
+    print("="*80)
+    
+    # Print individual test results
+    for test in results['tests']:
+        status_symbol = "✓" if test['status'] == 'passed' else "✗"
+        print(f"{status_symbol} {test['name']}: {test['status'].upper()}")
+        if test['errors']:
+            for error in test['errors']:
+                print(f"  Error: {error}")
+    
+    print("="*80)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From c1b03f58194684727bba2ee0c9dba86120e2dfa9 Mon Sep 17 00:00:00 2001
From: SpyrosMouselinos <mouselinos.spur.kw@gmail.com>
Date: Thu, 23 Oct 2025 13:17:06 +0200
Subject: [PATCH 3/5] Works on WSL2+Linux

---
 README.md                                |  36 +-
 core/gpu_disconnect.py                   | 190 +++++++++-
 core/gpu_test_workloads.py               | 432 -----------------------
 core/handlers.py                         | 211 ++++++-----
 core/monitor.py                          |  40 ++-
 test_quick_validation.py                 | 227 ------------
 tests/README.md                          | 243 -------------
 tests/test_gpu_disconnect_integration.py | 407 ---------------------
 8 files changed, 327 insertions(+), 1459 deletions(-)
 delete mode 100644 core/gpu_test_workloads.py
 delete mode 100644 test_quick_validation.py
 delete mode 100644 tests/README.md
 delete mode 100644 tests/test_gpu_disconnect_integration.py

diff --git a/README.md b/README.md
index 109db98..45f503a 100644
--- a/README.md
+++ b/README.md
@@ -128,35 +128,19 @@ POST /api/hub/gpu/disconnect-multiple
 
 ### Integration Testing
 
-GPU Hot includes a comprehensive testing framework to validate disconnect functionality:
-
-**Run Full Test Suite:**
-```bash
-cd tests
-sudo python3 test_gpu_disconnect_integration.py
-```
+GPU Hot includes comprehensive API testing for disconnect functionality:
 
 **Manual API Testing:**
 ```bash
-# 1. Create GPU workload
-curl -X POST http://localhost:1312/api/gpu/workload/create \
+# Test disconnect functionality
+curl -X POST http://localhost:1312/api/gpu/disconnect-multiple \
   -H "Content-Type: application/json" \
-  -d '{"gpu_id": 0, "workload_type": "compute_intensive", "duration": 30.0}'
-
-# 2. Start workload (use workload_id from response)
-curl -X POST http://localhost:1312/api/gpu/workload/{workload_id}/start
+  -d '{"gpu_indices": [0], "method": "auto", "down_time": 10}'
 
-# 3. Trigger disconnect while workload is running
-curl -X POST http://localhost:1312/api/gpu/0/disconnect \
-  -H "Content-Type": application/json" \
-  -d '{"method": "auto", "down_time": 5.0}'
-
-# 4. Check workload status (should be "interrupted" or "failed")
-curl http://localhost:1312/api/gpu/workload/{workload_id}/status
+# Check disconnect status
+curl http://localhost:1312/api/gpu/disconnect/status
 ```
 
-See [`tests/README.md`](tests/README.md) for detailed testing documentation.
-
 ---
 
 ## Configuration
@@ -197,14 +181,6 @@ GET  /api/hub/gpu/{node}/{gpu_id}/disconnect/methods  # Get methods for node GPU
 POST /api/hub/gpu/{node}/{gpu_id}/disconnect   # Disconnect GPU on specific node
 POST /api/hub/gpu/disconnect-multiple          # Multi-node batch disconnect
 GET  /api/hub/gpu/disconnect/status             # Hub-wide disconnect status
-
-# GPU Workload Testing API
-POST   /api/gpu/workload/create                # Create new GPU workload
-POST   /api/gpu/workload/{id}/start            # Start workload
-POST   /api/gpu/workload/{id}/stop             # Stop workload
-GET    /api/gpu/workload/{id}/status           # Get workload status
-GET    /api/gpu/workloads                      # List all workloads
-DELETE /api/gpu/workloads/cleanup              # Clean up completed workloads
 ```
 
 ### WebSocket
diff --git a/core/gpu_disconnect.py b/core/gpu_disconnect.py
index 29f3457..e5084b6 100644
--- a/core/gpu_disconnect.py
+++ b/core/gpu_disconnect.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 from typing import Optional, Dict, List
 from enum import Enum
+import pynvml
 
 logger = logging.getLogger(__name__)
 
@@ -19,14 +20,36 @@
 SYSFS_PCI_SLOTS = Path("/sys/bus/pci/slots")
 SYSFS_PCI_RESCAN = Path("/sys/bus/pci/rescan")
 
+# Global state for simulated disconnects
+_simulated_offline_gpus = set()
+
+
+def is_wsl2() -> bool:
+    """Detect if running in WSL2"""
+    try:
+        with open('/proc/version', 'r') as f:
+            version = f.read().lower()
+            return 'wsl2' in version or 'microsoft' in version
+    except Exception:
+        return False
+
+
+def is_gpu_simulated_offline(gpu_index: int) -> bool:
+    """Check if GPU is in simulated offline state"""
+    return gpu_index in _simulated_offline_gpus
+
 
 class DisconnectMethod(Enum):
     """Available GPU disconnect methods"""
     AUTO = "auto"
+    # Real PCI disconnects (Linux native only)
     SLOT_POWER = "slot"
     HOT_RESET = "hot"
     LOGICAL = "logical"
+    # WSL2-compatible methods
     NVIDIA_RESET = "nvidia"
+    SIMULATED = "simulated"
+    MEMORY_FLOOD = "memory_flood"  # Experimental
 
 
 class GPUDisconnectError(Exception):
@@ -44,6 +67,12 @@ def _check_root_permissions(self):
         """Check if running with sufficient privileges"""
         if os.geteuid() != 0:
             logger.warning("GPU disconnect requires root privileges. Operations may fail.")
+        
+        # Log environment detection
+        if is_wsl2():
+            logger.info("WSL2 environment detected - PCI methods unavailable, will use WSL2-compatible methods")
+        else:
+            logger.info("Native Linux environment detected - all disconnect methods available")
 
     async def disconnect_gpu(
         self, 
@@ -73,7 +102,7 @@ async def disconnect_gpu(
                 logger.warning(f"GPU {gpu_index} has {len(processes)} active processes")
             
             # Perform disconnect/reconnect
-            result = await self._execute_disconnect(bdf, method, down_time)
+            result = await self._execute_disconnect(bdf, method, down_time, gpu_index)
             result.update({
                 'gpu_index': gpu_index,
                 'bdf': bdf,
@@ -214,10 +243,10 @@ async def _check_gpu_processes(self, gpu_index: int) -> List[Dict]:
         except Exception:
             return []
 
-    async def _execute_disconnect(self, bdf: str, method: DisconnectMethod, down_time: float) -> Dict:
+    async def _execute_disconnect(self, bdf: str, method: DisconnectMethod, down_time: float, gpu_index: int = None) -> Dict:
         """Execute the actual disconnect/reconnect operation"""
         if method == DisconnectMethod.AUTO:
-            method = await self._select_best_method(bdf)
+            method = await self._select_best_method(bdf, gpu_index)
         
         start_time = time.time()
         
@@ -229,7 +258,11 @@ async def _execute_disconnect(self, bdf: str, method: DisconnectMethod, down_tim
             elif method == DisconnectMethod.LOGICAL:
                 await self._logical_disconnect(bdf, down_time)
             elif method == DisconnectMethod.NVIDIA_RESET:
-                await self._nvidia_reset_disconnect(bdf, down_time)
+                await self._nvidia_reset_disconnect(bdf, down_time, gpu_index)
+            elif method == DisconnectMethod.SIMULATED:
+                await self._simulated_disconnect(gpu_index, down_time)
+            elif method == DisconnectMethod.MEMORY_FLOOD:
+                await self._memory_flood_disconnect(gpu_index, down_time)
             else:
                 raise GPUDisconnectError(f"Unsupported method: {method}")
             
@@ -250,8 +283,21 @@ async def _execute_disconnect(self, bdf: str, method: DisconnectMethod, down_tim
                 'error': str(e)
             }
 
-    async def _select_best_method(self, bdf: str) -> DisconnectMethod:
-        """Select the best available method for maximum realism"""
+    async def _select_best_method(self, bdf: str, gpu_index: int = None) -> DisconnectMethod:
+        """Select the best available method based on environment"""
+        
+        # WSL2 detection - use soft methods
+        if is_wsl2():
+            logger.info("WSL2 detected - using SIMULATED disconnect (PCI methods unavailable)")
+            return DisconnectMethod.SIMULATED
+        
+        # Native Linux - check PCI capabilities
+        device_path = SYSFS_PCI_DEVICES / bdf
+        if not device_path.exists():
+            logger.warning(f"PCI device {bdf} not accessible - falling back to SIMULATED")
+            return DisconnectMethod.SIMULATED
+        
+        # Use real PCI methods in order of preference
         if self._has_slot_power(bdf):
             return DisconnectMethod.SLOT_POWER
         elif self._has_hot_reset_capability(bdf):
@@ -403,25 +449,72 @@ async def _hot_reset_disconnect(self, bdf: str, down_time: float):
 
     async def _logical_disconnect(self, bdf: str, down_time: float):
         """Execute logical disconnect (remove/rescan)"""
-        logger.info(f"Executing logical disconnect for {bdf}")
+        logger.info(f"[DISCONNECT START] GPU {bdf} - target down_time: {down_time}s")
+        
+        device_path = SYSFS_PCI_DEVICES / bdf
+        
+        # Log state before removal
+        try:
+            nvml_count_pre = pynvml.nvmlDeviceGetCount()
+        except Exception as e:
+            nvml_count_pre = f"Error: {e}"
+        
+        logger.info(f"[PRE-REMOVE] Device path exists: {device_path.exists()}")
+        logger.info(f"[PRE-REMOVE] NVML device count: {nvml_count_pre}")
         
         # Unbind and remove
         await self._unbind_driver(bdf)
-        await self._write_sysfs(SYSFS_PCI_DEVICES / bdf / "remove", "1")
+        logger.info(f"[REMOVE] Writing '1' to {device_path / 'remove'}")
+        await self._write_sysfs(device_path / "remove", "1")
+        
+        # Wait briefly for removal to take effect, then verify
+        await asyncio.sleep(0.5)
+        
+        try:
+            nvml_count_post = pynvml.nvmlDeviceGetCount()
+        except Exception as e:
+            nvml_count_post = f"Error: {e}"
+        
+        logger.info(f"[POST-REMOVE] Device path exists: {device_path.exists()}")
+        logger.info(f"[POST-REMOVE] NVML device count: {nvml_count_post}")
+        
+        if device_path.exists():
+            logger.warning(f"[POST-REMOVE] WARNING: Device {bdf} still exists after removal!")
+        else:
+            logger.info(f"[POST-REMOVE] Confirmed: Device {bdf} successfully removed from PCI bus")
         
+        # Sleep for down_time
+        sleep_start = time.time()
+        logger.info(f"[SLEEP START] Sleeping for {down_time}s to simulate disconnect")
         await asyncio.sleep(down_time)
+        sleep_duration = time.time() - sleep_start
+        logger.info(f"[SLEEP END] Actual sleep duration: {sleep_duration:.2f}s")
         
-        # Rescan
+        # Rescan PCI bus
+        logger.info(f"[RESCAN] Triggering PCI bus rescan")
         await self._write_sysfs(SYSFS_PCI_RESCAN, "1")
+        
+        # Wait for device to reappear
+        logger.info(f"[RESCAN] Waiting for {bdf} to reappear (timeout: 30s)")
         await self._wait_for_condition(
             lambda: (SYSFS_PCI_DEVICES / bdf).exists(),
             timeout=30,
             description=f"{bdf} to reappear"
         )
+        
+        # Verify reconnection
+        try:
+            nvml_count_final = pynvml.nvmlDeviceGetCount()
+        except Exception as e:
+            nvml_count_final = f"Error: {e}"
+        
+        logger.info(f"[POST-RESCAN] Device path exists: {device_path.exists()}")
+        logger.info(f"[POST-RESCAN] NVML device count: {nvml_count_final}")
+        logger.info(f"[DISCONNECT END] GPU {bdf} reconnected successfully")
 
-    async def _nvidia_reset_disconnect(self, bdf: str, down_time: float):
-        """Execute NVIDIA GPU reset"""
-        logger.info(f"Executing NVIDIA reset for {bdf}")
+    async def _nvidia_reset_disconnect(self, bdf: str, down_time: float, gpu_index: int = None):
+        """Execute NVIDIA GPU reset using nvidia-smi"""
+        logger.info(f"[NVIDIA-RESET] Resetting GPU {gpu_index if gpu_index is not None else 'unknown'} ({bdf})")
         
         # Find GPU index from BDF
         gpu_index = await self._get_gpu_index_from_bdf(bdf)
@@ -521,6 +614,79 @@ async def _wait_for_condition(self, condition, timeout: int, description: str):
             await asyncio.sleep(0.25)
         
         raise GPUDisconnectError(f"Timeout waiting for {description}")
+    
+    async def _simulated_disconnect(self, gpu_index: int, down_time: float):
+        """Simulate disconnect in software only - WSL2 safe"""
+        logger.info(f"[SIMULATED] Marking GPU {gpu_index} as offline for {down_time}s")
+        logger.info(f"[SIMULATED] This is a software-only simulation - GPU remains physically available")
+        
+        # Add to simulated offline set
+        _simulated_offline_gpus.add(gpu_index)
+        
+        try:
+            logger.info(f"[SIMULATED] GPU {gpu_index} now appears 'disconnected' to monitor")
+            await asyncio.sleep(down_time)
+        finally:
+            # Remove from offline set
+            if gpu_index in _simulated_offline_gpus:
+                _simulated_offline_gpus.remove(gpu_index)
+            logger.info(f"[SIMULATED] GPU {gpu_index} back online - disconnect simulation complete")
+    
+    async def _memory_flood_disconnect(self, gpu_index: int, down_time: float):
+        """Flood GPU memory to trigger potential OOM/driver reset - EXPERIMENTAL"""
+        logger.warning(f"[MEMORY-FLOOD] Starting EXPERIMENTAL memory flood on GPU {gpu_index}")
+        logger.warning(f"[MEMORY-FLOOD] This may cause unpredictable behavior or system instability!")
+        
+        try:
+            import torch
+        except ImportError:
+            raise GPUDisconnectError("PyTorch not available - memory flood requires torch")
+        
+        try:
+            torch.cuda.set_device(gpu_index)
+            total_mem = torch.cuda.get_device_properties(gpu_index).total_memory
+            logger.info(f"[MEMORY-FLOOD] GPU {gpu_index} total memory: {total_mem / 1e9:.2f}GB")
+            
+            allocations = []
+            allocated_bytes = 0
+            chunk_size = 100 * 1024 * 1024  # 100MB chunks
+            
+            # Phase 1: Allocate until OOM
+            logger.info(f"[MEMORY-FLOOD] Phase 1: Allocating memory until OOM...")
+            try:
+                while allocated_bytes < total_mem * 0.95:  # Don't try to allocate 100%
+                    tensor = torch.empty(chunk_size // 4, dtype=torch.float32, device=f'cuda:{gpu_index}')
+                    allocations.append(tensor)
+                    allocated_bytes += chunk_size
+                    
+                    if len(allocations) % 10 == 0:
+                        logger.debug(f"[MEMORY-FLOOD] Allocated {allocated_bytes / 1e9:.2f}GB")
+                        
+            except RuntimeError as e:
+                if "out of memory" in str(e).lower():
+                    logger.info(f"[MEMORY-FLOOD] OOM reached at {allocated_bytes / 1e9:.2f}GB: {e}")
+                else:
+                    raise
+            
+            # Phase 2: Hold memory for down_time
+            logger.info(f"[MEMORY-FLOOD] Phase 2: Holding {allocated_bytes / 1e9:.2f}GB for {down_time}s")
+            logger.info(f"[MEMORY-FLOOD] GPU {gpu_index} should be unresponsive during this time")
+            
+            await asyncio.sleep(down_time)
+            
+        except Exception as e:
+            logger.error(f"[MEMORY-FLOOD] Error during memory flood: {e}")
+            raise
+        finally:
+            # Phase 3: Release memory
+            logger.info(f"[MEMORY-FLOOD] Phase 3: Releasing memory...")
+            allocations.clear()
+            
+            if 'torch' in dir():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize(gpu_index)
+            
+            logger.info(f"[MEMORY-FLOOD] Memory flood complete - GPU {gpu_index} should recover")
 
 
 # Global instance
diff --git a/core/gpu_test_workloads.py b/core/gpu_test_workloads.py
deleted file mode 100644
index f97f3d4..0000000
--- a/core/gpu_test_workloads.py
+++ /dev/null
@@ -1,432 +0,0 @@
-#!/usr/bin/env python3
-"""
-GPU Test Workloads - Generate various GPU operations for disconnect testing
-Uses PyTorch/CuPy for CUDA operations without requiring custom CUDA code
-"""
-
-import asyncio
-import logging
-import time
-import threading
-from datetime import datetime
-from typing import Optional, Dict, List
-from enum import Enum
-
-logger = logging.getLogger(__name__)
-
-# Try to import GPU libraries
-try:
-    import torch
-    TORCH_AVAILABLE = torch.cuda.is_available()
-except ImportError:
-    TORCH_AVAILABLE = False
-    logger.warning("PyTorch not available - GPU workload tests will be limited")
-
-try:
-    import cupy as cp
-    CUPY_AVAILABLE = True
-except ImportError:
-    CUPY_AVAILABLE = False
-    logger.warning("CuPy not available - using PyTorch for workloads")
-
-
-class WorkloadType(Enum):
-    """Types of GPU workloads for testing"""
-    MEMORY_STRESS = "memory_stress"
-    COMPUTE_INTENSIVE = "compute_intensive"
-    LONG_RUNNING = "long_running"
-    CONTINUOUS = "continuous"
-    MIXED = "mixed"
-
-
-class WorkloadStatus(Enum):
-    """Status of a running workload"""
-    PENDING = "pending"
-    RUNNING = "running"
-    COMPLETED = "completed"
-    FAILED = "failed"
-    INTERRUPTED = "interrupted"
-
-
-class GPUWorkload:
-    """Represents a single GPU workload operation"""
-    
-    def __init__(self, workload_id: str, gpu_id: int, workload_type: WorkloadType, duration: float = 10.0):
-        self.workload_id = workload_id
-        self.gpu_id = gpu_id
-        self.workload_type = workload_type
-        self.duration = duration
-        self.status = WorkloadStatus.PENDING
-        self.start_time = None
-        self.end_time = None
-        self.error = None
-        self.progress = 0.0
-        self.iterations_completed = 0
-        self.expected_iterations = 100
-        self._stop_event = threading.Event()
-        self._thread = None
-    
-    def start(self):
-        """Start the workload in a background thread"""
-        if self.status != WorkloadStatus.PENDING:
-            raise RuntimeError(f"Workload {self.workload_id} already started")
-        
-        self.status = WorkloadStatus.RUNNING
-        self.start_time = datetime.now()
-        
-        # Run in separate thread to avoid blocking
-        self._thread = threading.Thread(target=self._run_workload, daemon=True)
-        self._thread.start()
-    
-    def stop(self):
-        """Stop the workload gracefully"""
-        self._stop_event.set()
-        if self._thread:
-            self._thread.join(timeout=5.0)
-    
-    def _run_workload(self):
-        """Execute the actual GPU workload"""
-        try:
-            if self.workload_type == WorkloadType.MEMORY_STRESS:
-                self._memory_stress()
-            elif self.workload_type == WorkloadType.COMPUTE_INTENSIVE:
-                self._compute_intensive()
-            elif self.workload_type == WorkloadType.LONG_RUNNING:
-                self._long_running()
-            elif self.workload_type == WorkloadType.CONTINUOUS:
-                self._continuous()
-            elif self.workload_type == WorkloadType.MIXED:
-                self._mixed()
-            else:
-                raise ValueError(f"Unknown workload type: {self.workload_type}")
-            
-            if not self._stop_event.is_set():
-                self.status = WorkloadStatus.COMPLETED
-                self.end_time = datetime.now()
-                logger.info(f"Workload {self.workload_id} completed successfully")
-            else:
-                self.status = WorkloadStatus.INTERRUPTED
-                self.end_time = datetime.now()
-                logger.info(f"Workload {self.workload_id} interrupted")
-                
-        except Exception as e:
-            self.status = WorkloadStatus.FAILED
-            self.end_time = datetime.now()
-            self.error = str(e)
-            logger.error(f"Workload {self.workload_id} failed: {e}")
-    
-    def _memory_stress(self):
-        """Allocate and deallocate GPU memory repeatedly"""
-        if TORCH_AVAILABLE:
-            logger.info(f"Starting memory stress test on GPU {self.gpu_id}")
-            device = torch.device(f'cuda:{self.gpu_id}')
-            
-            iteration = 0
-            start = time.time()
-            
-            while not self._stop_event.is_set() and (time.time() - start) < self.duration:
-                try:
-                    # Allocate large tensors
-                    tensors = []
-                    for _ in range(10):
-                        if self._stop_event.is_set():
-                            break
-                        # Allocate ~100MB per tensor
-                        tensor = torch.randn(1024, 1024, 25, device=device)
-                        tensors.append(tensor)
-                    
-                    # Do some operations
-                    if tensors and not self._stop_event.is_set():
-                        result = torch.stack(tensors).sum()
-                        _ = result.cpu()  # Force computation
-                    
-                    # Deallocate
-                    del tensors
-                    torch.cuda.empty_cache()
-                    
-                    iteration += 1
-                    self.iterations_completed = iteration
-                    self.progress = min(100.0, (time.time() - start) / self.duration * 100)
-                    
-                    time.sleep(0.1)  # Brief pause between iterations
-                    
-                except RuntimeError as e:
-                    if "CUDA" in str(e) or "out of memory" in str(e):
-                        raise  # GPU-related errors should propagate
-                    logger.warning(f"Non-critical error in memory stress: {e}")
-        else:
-            # Fallback without GPU
-            logger.warning("PyTorch CUDA not available, simulating memory stress")
-            time.sleep(self.duration)
-            self.iterations_completed = 100
-    
-    def _compute_intensive(self):
-        """Perform compute-intensive matrix operations"""
-        if TORCH_AVAILABLE:
-            logger.info(f"Starting compute-intensive test on GPU {self.gpu_id}")
-            device = torch.device(f'cuda:{self.gpu_id}')
-            
-            iteration = 0
-            start = time.time()
-            
-            # Create large matrices
-            size = 2048
-            matrix_a = torch.randn(size, size, device=device)
-            matrix_b = torch.randn(size, size, device=device)
-            
-            while not self._stop_event.is_set() and (time.time() - start) < self.duration:
-                try:
-                    # Matrix multiplication (compute-heavy)
-                    result = torch.matmul(matrix_a, matrix_b)
-                    
-                    # Additional operations
-                    result = torch.nn.functional.relu(result)
-                    result = torch.nn.functional.softmax(result, dim=1)
-                    
-                    # Force synchronization
-                    torch.cuda.synchronize(device)
-                    
-                    iteration += 1
-                    self.iterations_completed = iteration
-                    self.progress = min(100.0, (time.time() - start) / self.duration * 100)
-                    
-                except RuntimeError as e:
-                    if "CUDA" in str(e):
-                        raise
-                    logger.warning(f"Non-critical error in compute test: {e}")
-            
-            del matrix_a, matrix_b
-            torch.cuda.empty_cache()
-        else:
-            logger.warning("PyTorch CUDA not available, simulating compute workload")
-            time.sleep(self.duration)
-            self.iterations_completed = 100
-    
-    def _long_running(self):
-        """Single long-running operation"""
-        if TORCH_AVAILABLE:
-            logger.info(f"Starting long-running test on GPU {self.gpu_id}")
-            device = torch.device(f'cuda:{self.gpu_id}')
-            
-            try:
-                # Create very large operation that takes time
-                size = 4096
-                matrix = torch.randn(size, size, device=device)
-                
-                start = time.time()
-                iterations = int(self.duration * 10)  # Adjust based on duration
-                
-                for i in range(iterations):
-                    if self._stop_event.is_set():
-                        break
-                    
-                    # Chain of operations
-                    result = torch.matmul(matrix, matrix)
-                    result = result + matrix
-                    result = torch.nn.functional.relu(result)
-                    matrix = result / result.max()
-                    
-                    torch.cuda.synchronize(device)
-                    
-                    self.iterations_completed = i + 1
-                    self.expected_iterations = iterations
-                    self.progress = min(100.0, (i + 1) / iterations * 100)
-                
-                del matrix, result
-                torch.cuda.empty_cache()
-                
-            except RuntimeError as e:
-                if "CUDA" in str(e):
-                    raise
-                logger.warning(f"Error in long-running test: {e}")
-        else:
-            logger.warning("PyTorch CUDA not available, simulating long-running workload")
-            time.sleep(self.duration)
-            self.iterations_completed = 100
-    
-    def _continuous(self):
-        """Continuous background operations"""
-        if TORCH_AVAILABLE:
-            logger.info(f"Starting continuous test on GPU {self.gpu_id}")
-            device = torch.device(f'cuda:{self.gpu_id}')
-            
-            iteration = 0
-            start = time.time()
-            
-            while not self._stop_event.is_set() and (time.time() - start) < self.duration:
-                try:
-                    # Rapid small operations
-                    tensor = torch.randn(512, 512, device=device)
-                    result = tensor @ tensor.T
-                    _ = result.sum().item()
-                    
-                    iteration += 1
-                    self.iterations_completed = iteration
-                    self.progress = min(100.0, (time.time() - start) / self.duration * 100)
-                    
-                except RuntimeError as e:
-                    if "CUDA" in str(e):
-                        raise
-                    time.sleep(0.01)
-            
-            torch.cuda.empty_cache()
-        else:
-            logger.warning("PyTorch CUDA not available, simulating continuous workload")
-            time.sleep(self.duration)
-            self.iterations_completed = 100
-    
-    def _mixed(self):
-        """Mixed workload combining memory and compute"""
-        if TORCH_AVAILABLE:
-            logger.info(f"Starting mixed test on GPU {self.gpu_id}")
-            device = torch.device(f'cuda:{self.gpu_id}')
-            
-            iteration = 0
-            start = time.time()
-            
-            while not self._stop_event.is_set() and (time.time() - start) < self.duration:
-                try:
-                    # Alternate between memory and compute
-                    if iteration % 2 == 0:
-                        # Memory operations
-                        tensors = [torch.randn(1024, 1024, device=device) for _ in range(5)]
-                        _ = torch.stack(tensors).sum()
-                        del tensors
-                    else:
-                        # Compute operations
-                        a = torch.randn(1024, 1024, device=device)
-                        b = torch.randn(1024, 1024, device=device)
-                        c = torch.matmul(a, b)
-                        _ = c.sum()
-                        del a, b, c
-                    
-                    torch.cuda.synchronize(device)
-                    torch.cuda.empty_cache()
-                    
-                    iteration += 1
-                    self.iterations_completed = iteration
-                    self.progress = min(100.0, (time.time() - start) / self.duration * 100)
-                    
-                    time.sleep(0.1)
-                    
-                except RuntimeError as e:
-                    if "CUDA" in str(e):
-                        raise
-                    logger.warning(f"Error in mixed workload: {e}")
-            
-        else:
-            logger.warning("PyTorch CUDA not available, simulating mixed workload")
-            time.sleep(self.duration)
-            self.iterations_completed = 100
-    
-    def get_status(self) -> Dict:
-        """Get current workload status"""
-        duration = None
-        if self.start_time:
-            end = self.end_time or datetime.now()
-            duration = (end - self.start_time).total_seconds()
-        
-        return {
-            'workload_id': self.workload_id,
-            'gpu_id': self.gpu_id,
-            'type': self.workload_type.value,
-            'status': self.status.value,
-            'progress': self.progress,
-            'iterations_completed': self.iterations_completed,
-            'expected_iterations': self.expected_iterations,
-            'duration_seconds': duration,
-            'error': self.error,
-            'start_time': self.start_time.isoformat() if self.start_time else None,
-            'end_time': self.end_time.isoformat() if self.end_time else None
-        }
-
-
-class GPUWorkloadManager:
-    """Manages multiple GPU workloads"""
-    
-    def __init__(self):
-        self.workloads: Dict[str, GPUWorkload] = {}
-        self.workload_counter = 0
-    
-    def create_workload(
-        self, 
-        gpu_id: int, 
-        workload_type: WorkloadType = WorkloadType.COMPUTE_INTENSIVE,
-        duration: float = 10.0
-    ) -> str:
-        """Create a new workload"""
-        self.workload_counter += 1
-        workload_id = f"workload_{self.workload_counter}_{int(time.time())}"
-        
-        workload = GPUWorkload(workload_id, gpu_id, workload_type, duration)
-        self.workloads[workload_id] = workload
-        
-        logger.info(f"Created workload {workload_id} for GPU {gpu_id}: {workload_type.value}")
-        return workload_id
-    
-    def start_workload(self, workload_id: str):
-        """Start a pending workload"""
-        if workload_id not in self.workloads:
-            raise ValueError(f"Workload {workload_id} not found")
-        
-        workload = self.workloads[workload_id]
-        workload.start()
-        logger.info(f"Started workload {workload_id}")
-    
-    def stop_workload(self, workload_id: str):
-        """Stop a running workload"""
-        if workload_id not in self.workloads:
-            raise ValueError(f"Workload {workload_id} not found")
-        
-        workload = self.workloads[workload_id]
-        workload.stop()
-        logger.info(f"Stopped workload {workload_id}")
-    
-    def get_workload_status(self, workload_id: str) -> Dict:
-        """Get status of a specific workload"""
-        if workload_id not in self.workloads:
-            raise ValueError(f"Workload {workload_id} not found")
-        
-        return self.workloads[workload_id].get_status()
-    
-    def get_all_workloads(self) -> List[Dict]:
-        """Get status of all workloads"""
-        return [w.get_status() for w in self.workloads.values()]
-    
-    def get_active_workloads(self) -> List[Dict]:
-        """Get status of currently running workloads"""
-        return [
-            w.get_status() 
-            for w in self.workloads.values() 
-            if w.status == WorkloadStatus.RUNNING
-        ]
-    
-    def cleanup_completed(self):
-        """Remove completed/failed workloads older than 5 minutes"""
-        cutoff = time.time() - 300  # 5 minutes ago
-        to_remove = []
-        
-        for wid, workload in self.workloads.items():
-            if workload.status in [WorkloadStatus.COMPLETED, WorkloadStatus.FAILED, WorkloadStatus.INTERRUPTED]:
-                if workload.end_time:
-                    end_timestamp = workload.end_time.timestamp()
-                    if end_timestamp < cutoff:
-                        to_remove.append(wid)
-        
-        for wid in to_remove:
-            del self.workloads[wid]
-        
-        if to_remove:
-            logger.info(f"Cleaned up {len(to_remove)} old workloads")
-    
-    def stop_all(self):
-        """Stop all running workloads"""
-        for workload in self.workloads.values():
-            if workload.status == WorkloadStatus.RUNNING:
-                workload.stop()
-        
-        logger.info("Stopped all workloads")
-
-
-# Global workload manager instance
-workload_manager = GPUWorkloadManager()
diff --git a/core/handlers.py b/core/handlers.py
index 8d481df..3ea26ab 100644
--- a/core/handlers.py
+++ b/core/handlers.py
@@ -10,7 +10,6 @@
 from pydantic import BaseModel
 from . import config
 from .gpu_disconnect import disconnect_gpu, disconnect_multiple_gpus, get_available_methods, GPUDisconnectError
-from .gpu_test_workloads import workload_manager, WorkloadType
 
 logger = logging.getLogger(__name__)
 
@@ -30,12 +29,6 @@ class MultiDisconnectRequest(BaseModel):
     down_time: float = 5.0
 
 
-class WorkloadRequest(BaseModel):
-    gpu_id: int
-    workload_type: str = "compute_intensive"
-    duration: float = 10.0
-
-
 def register_handlers(app, monitor):
     """Register FastAPI WebSocket handlers"""
     
@@ -63,11 +56,20 @@ async def websocket_endpoint(websocket: WebSocket):
     async def get_disconnect_methods(gpu_id: int):
         """Get available disconnect methods for a GPU"""
         try:
+            from .gpu_disconnect import is_wsl2
+            
             methods = await get_available_methods(gpu_id)
+            in_wsl2 = is_wsl2()
+            
             return {
                 "gpu_id": gpu_id,
                 "available_methods": methods,
-                "default_method": "auto"
+                "default_method": "auto",
+                "environment": {
+                    "is_wsl2": in_wsl2,
+                    "recommended_method": "simulated" if in_wsl2 else "auto",
+                    "pci_available": not in_wsl2
+                }
             }
         except Exception as e:
             logger.error(f"Error getting disconnect methods for GPU {gpu_id}: {e}")
@@ -115,10 +117,72 @@ async def disconnect_multiple(request: MultiDisconnectRequest):
             logger.error(f"Unexpected error during multi-GPU disconnect: {e}")
             raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
     
+    @app.get("/api/gpu/verify-disconnect/{gpu_id}")
+    async def verify_gpu_disconnect(gpu_id: int):
+        """Verify GPU visibility - check if GPU exists via NVML, nvidia-smi, and sysfs"""
+        import subprocess
+        from pathlib import Path
+        
+        result = {
+            "gpu_id": gpu_id,
+            "timestamp": datetime.now().isoformat(),
+            "checks": {}
+        }
+        
+        # Check NVML device count
+        try:
+            import pynvml
+            device_count = pynvml.nvmlDeviceGetCount()
+            result["checks"]["nvml_total_devices"] = device_count
+            result["checks"]["nvml_status"] = "success"
+            
+            # Try to get handle for specific GPU
+            try:
+                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
+                pci_info = pynvml.nvmlDeviceGetPciInfo(handle)
+                result["checks"]["nvml_gpu_exists"] = True
+                result["checks"]["nvml_pci_bdf"] = pci_info.busId.decode('utf-8')
+            except Exception as e:
+                result["checks"]["nvml_gpu_exists"] = False
+                result["checks"]["nvml_gpu_error"] = str(e)
+        except Exception as e:
+            result["checks"]["nvml_status"] = f"error: {e}"
+        
+        # Check nvidia-smi
+        try:
+            smi_result = subprocess.run(
+                ['nvidia-smi', '--query-gpu=index,name,pci.bus_id', '--format=csv,noheader'],
+                capture_output=True,
+                text=True,
+                timeout=5
+            )
+            result["checks"]["nvidia_smi_success"] = smi_result.returncode == 0
+            if smi_result.returncode == 0:
+                gpu_lines = [line for line in smi_result.stdout.strip().split('\n') if line.startswith(str(gpu_id))]
+                result["checks"]["nvidia_smi_gpu_found"] = len(gpu_lines) > 0
+                if gpu_lines:
+                    result["checks"]["nvidia_smi_output"] = gpu_lines[0]
+            else:
+                result["checks"]["nvidia_smi_error"] = smi_result.stderr
+        except Exception as e:
+            result["checks"]["nvidia_smi_success"] = False
+            result["checks"]["nvidia_smi_error"] = str(e)
+        
+        # Check PCI sysfs path
+        if "nvml_pci_bdf" in result["checks"]:
+            bdf = result["checks"]["nvml_pci_bdf"]
+            pci_path = Path(f"/sys/bus/pci/devices/{bdf}")
+            result["checks"]["pci_device_exists"] = pci_path.exists()
+            result["checks"]["pci_device_path"] = str(pci_path)
+        
+        return JSONResponse(content=result)
+    
     @app.get("/api/gpu/disconnect/status")
     async def get_disconnect_status():
         """Get current disconnect operation status and system capabilities"""
         try:
+            from .gpu_disconnect import is_wsl2
+            
             # Check root permissions
             import os
             has_root = os.geteuid() == 0
@@ -131,116 +195,49 @@ async def get_disconnect_status():
             from pathlib import Path
             sysfs_accessible = Path("/sys/bus/pci/devices").exists()
             
+            # WSL2 detection
+            in_wsl2 = is_wsl2()
+            
+            # Determine readiness based on environment
+            if in_wsl2:
+                ready = has_nvidia_smi  # WSL2 only needs nvidia-smi for some methods
+            else:
+                ready = has_root and has_nvidia_smi and sysfs_accessible
+            
+            warnings = []
+            if in_wsl2:
+                warnings.append("WSL2 detected - PCI disconnect unavailable, using simulated/soft methods")
+            else:
+                if not has_root:
+                    warnings.append("Root privileges required for PCI operations")
+                if not has_nvidia_smi:
+                    warnings.append("nvidia-smi not found in PATH")
+                if not sysfs_accessible:
+                    warnings.append("PCI sysfs interface not accessible")
+            
             return {
-                "ready": has_root and has_nvidia_smi and sysfs_accessible,
+                "ready": ready,
+                "environment": {
+                    "is_wsl2": in_wsl2,
+                    "platform": "WSL2" if in_wsl2 else "Native Linux"
+                },
                 "permissions": {
                     "root_access": has_root,
                     "nvidia_smi_available": has_nvidia_smi,
                     "sysfs_accessible": sysfs_accessible
                 },
-                "warnings": [
-                    "Root privileges required for PCI operations" if not has_root else None,
-                    "nvidia-smi not found in PATH" if not has_nvidia_smi else None,
-                    "PCI sysfs interface not accessible" if not sysfs_accessible else None
-                ]
+                "capabilities": {
+                    "pci_disconnect": not in_wsl2 and sysfs_accessible,
+                    "nvidia_reset": has_nvidia_smi,
+                    "simulated": True,
+                    "memory_flood": has_nvidia_smi  # Needs torch/CUDA
+                },
+                "warnings": [w for w in warnings if w]
             }
             
         except Exception as e:
             logger.error(f"Error checking disconnect status: {e}")
             raise HTTPException(status_code=500, detail=str(e))
-    
-    # GPU Workload Testing API Endpoints
-    @app.post("/api/gpu/workload/create")
-    async def create_workload(request: WorkloadRequest):
-        """Create a new GPU workload for testing"""
-        try:
-            workload_id = workload_manager.create_workload(
-                gpu_id=request.gpu_id,
-                workload_type=WorkloadType(request.workload_type),
-                duration=request.duration
-            )
-            
-            return {
-                "workload_id": workload_id,
-                "gpu_id": request.gpu_id,
-                "workload_type": request.workload_type,
-                "duration": request.duration,
-                "status": "created"
-            }
-            
-        except Exception as e:
-            logger.error(f"Error creating workload: {e}")
-            raise HTTPException(status_code=500, detail=str(e))
-    
-    @app.post("/api/gpu/workload/{workload_id}/start")
-    async def start_workload(workload_id: str):
-        """Start a GPU workload"""
-        try:
-            workload_manager.start_workload(workload_id)
-            status = workload_manager.get_workload_status(workload_id)
-            return status
-            
-        except ValueError as e:
-            raise HTTPException(status_code=404, detail=str(e))
-        except Exception as e:
-            logger.error(f"Error starting workload: {e}")
-            raise HTTPException(status_code=500, detail=str(e))
-    
-    @app.post("/api/gpu/workload/{workload_id}/stop")
-    async def stop_workload(workload_id: str):
-        """Stop a running GPU workload"""
-        try:
-            workload_manager.stop_workload(workload_id)
-            status = workload_manager.get_workload_status(workload_id)
-            return status
-            
-        except ValueError as e:
-            raise HTTPException(status_code=404, detail=str(e))
-        except Exception as e:
-            logger.error(f"Error stopping workload: {e}")
-            raise HTTPException(status_code=500, detail=str(e))
-    
-    @app.get("/api/gpu/workload/{workload_id}/status")
-    async def get_workload_status_api(workload_id: str):
-        """Get status of a specific workload"""
-        try:
-            status = workload_manager.get_workload_status(workload_id)
-            return status
-            
-        except ValueError as e:
-            raise HTTPException(status_code=404, detail=str(e))
-        except Exception as e:
-            logger.error(f"Error getting workload status: {e}")
-            raise HTTPException(status_code=500, detail=str(e))
-    
-    @app.get("/api/gpu/workloads")
-    async def get_all_workloads():
-        """Get status of all workloads"""
-        try:
-            workloads = workload_manager.get_all_workloads()
-            active = workload_manager.get_active_workloads()
-            
-            return {
-                "total_workloads": len(workloads),
-                "active_workloads": len(active),
-                "workloads": workloads,
-                "active": active
-            }
-            
-        except Exception as e:
-            logger.error(f"Error getting workloads: {e}")
-            raise HTTPException(status_code=500, detail=str(e))
-    
-    @app.delete("/api/gpu/workloads/cleanup")
-    async def cleanup_workloads():
-        """Clean up completed workloads"""
-        try:
-            workload_manager.cleanup_completed()
-            return {"status": "ok", "message": "Cleaned up completed workloads"}
-            
-        except Exception as e:
-            logger.error(f"Error cleaning up workloads: {e}")
-            raise HTTPException(status_code=500, detail=str(e))
 
 
 async def monitor_loop(monitor, connections):
diff --git a/core/monitor.py b/core/monitor.py
index fa1f946..a5f43fc 100644
--- a/core/monitor.py
+++ b/core/monitor.py
@@ -7,6 +7,7 @@
 from .metrics import MetricsCollector
 from .nvidia_smi_fallback import parse_nvidia_smi
 from .config import NVIDIA_SMI
+from .gpu_disconnect import is_gpu_simulated_offline
 
 logger = logging.getLogger(__name__)
 
@@ -19,6 +20,7 @@ def __init__(self):
         self.gpu_data = {}
         self.collector = MetricsCollector()
         self.use_smi = {}  # Track which GPUs use nvidia-smi (decided at boot)
+        self.last_device_count = None  # Track device count changes
 
         try:
             pynvml.nvmlInit()
@@ -87,6 +89,16 @@ async def get_gpu_data(self):
 
         try:
             device_count = pynvml.nvmlDeviceGetCount()
+            
+            # Log device count changes (indicates GPU disconnect/reconnect)
+            if self.last_device_count is not None and device_count != self.last_device_count:
+                logger.warning(f"[MONITOR] *** GPU DEVICE COUNT CHANGED: {self.last_device_count} -> {device_count} ***")
+                if device_count < self.last_device_count:
+                    logger.warning(f"[MONITOR] *** GPU(s) DISAPPEARED - {self.last_device_count - device_count} device(s) missing ***")
+                else:
+                    logger.info(f"[MONITOR] *** GPU(s) REAPPEARED - {device_count - self.last_device_count} device(s) added ***")
+            
+            self.last_device_count = device_count
             gpu_data = {}
 
             # Get nvidia-smi data once if any GPU needs it
@@ -104,6 +116,24 @@ async def get_gpu_data(self):
             tasks = []
             for i in range(device_count):
                 gpu_id = str(i)
+                
+                # Check if GPU is in simulated offline state
+                if is_gpu_simulated_offline(i):
+                    logger.debug(f"[MONITOR] GPU {i} is in simulated offline state - skipping")
+                    # Create offline data
+                    gpu_data[gpu_id] = {
+                        'index': gpu_id,
+                        'name': self.gpu_data.get(gpu_id, {}).get('name', 'Unknown GPU'),
+                        'simulated_offline': True,
+                        'status': 'Simulated Disconnect',
+                        'utilization': None,
+                        'memory_used': 0,
+                        'memory_total': 0,
+                        'temperature': None,
+                        'power_draw': None,
+                    }
+                    continue
+                
                 if self.use_smi.get(gpu_id, False):
                     # Use nvidia-smi data
                     if smi_data and gpu_id in smi_data:
@@ -141,8 +171,16 @@ def _collect_single_gpu(self, gpu_index):
         try:
             handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
             return self.collector.collect_all(handle, str(gpu_index))
+        except pynvml.NVMLError as e:
+            # NVML-specific errors might indicate GPU is disconnected
+            error_str = str(e)
+            if "Not Found" in error_str or "Unknown Error" in error_str or "GPU is lost" in error_str:
+                logger.warning(f"[MONITOR] GPU {gpu_index}: Cannot access GPU - may be disconnected ({error_str})")
+            else:
+                logger.error(f"[MONITOR] GPU {gpu_index}: NVML Error - {e}")
+            return {}
         except Exception as e:
-            logger.error(f"GPU {gpu_index}: Error - {e}")
+            logger.error(f"[MONITOR] GPU {gpu_index}: Unexpected error - {e}")
             return {}
 
     async def get_processes(self):
diff --git a/test_quick_validation.py b/test_quick_validation.py
deleted file mode 100644
index d76cdf6..0000000
--- a/test_quick_validation.py
+++ /dev/null
@@ -1,227 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quick validation script - Test GPU disconnect functionality
-Run this to verify the implementation works on your system
-"""
-
-import sys
-import time
-import asyncio
-
-print("="*80)
-print("GPU DISCONNECT FUNCTIONALITY - QUICK VALIDATION TEST")
-print("="*80)
-print()
-
-# Check 1: Verify all modules can be imported
-print("✓ Step 1: Checking module imports...")
-try:
-    from core.gpu_disconnect import gpu_disconnector, DisconnectMethod
-    print("  ✓ GPU disconnect module loaded")
-except ImportError as e:
-    print(f"  ✗ Failed to import gpu_disconnect: {e}")
-    sys.exit(1)
-
-try:
-    from core.gpu_test_workloads import workload_manager, WorkloadType, TORCH_AVAILABLE
-    print("  ✓ GPU workload module loaded")
-except ImportError as e:
-    print(f"  ✗ Failed to import gpu_test_workloads: {e}")
-    sys.exit(1)
-
-try:
-    from tests.test_gpu_disconnect_integration import (
-        create_basic_disconnect_test,
-        create_standard_test_suite
-    )
-    print("  ✓ Integration test module loaded")
-except ImportError as e:
-    print(f"  ✗ Failed to import integration tests: {e}")
-    sys.exit(1)
-
-print()
-
-# Check 2: Verify PyTorch availability
-print("✓ Step 2: Checking GPU libraries...")
-if TORCH_AVAILABLE:
-    import torch
-    gpu_count = torch.cuda.device_count()
-    print(f"  ✓ PyTorch CUDA available: {gpu_count} GPU(s) detected")
-    if gpu_count > 0:
-        for i in range(gpu_count):
-            name = torch.cuda.get_device_name(i)
-            print(f"    - GPU {i}: {name}")
-else:
-    print("  ⚠ PyTorch CUDA not available")
-    print("    Install with: pip install torch --index-url https://download.pytorch.org/whl/cu118")
-    print("    Continuing with limited functionality...")
-
-print()
-
-# Check 3: Test workload creation
-print("✓ Step 3: Testing workload creation...")
-try:
-    workload_id = workload_manager.create_workload(
-        gpu_id=0,
-        workload_type=WorkloadType.COMPUTE_INTENSIVE,
-        duration=5.0
-    )
-    print(f"  ✓ Created test workload: {workload_id}")
-    
-    # Get status
-    status = workload_manager.get_workload_status(workload_id)
-    print(f"  ✓ Workload status: {status['status']}")
-    
-except Exception as e:
-    print(f"  ✗ Failed to create workload: {e}")
-    sys.exit(1)
-
-print()
-
-# Check 4: Test disconnect capability detection
-print("✓ Step 4: Checking disconnect capabilities...")
-async def check_disconnect():
-    try:
-        methods = await gpu_disconnector.get_available_methods(0)
-        print(f"  ✓ Available disconnect methods: {', '.join(methods)}")
-        return True
-    except Exception as e:
-        print(f"  ⚠ Could not detect methods: {e}")
-        print("    This is expected if not running as root")
-        return False
-
-has_disconnect = asyncio.run(check_disconnect())
-
-print()
-
-# Check 5: Run a simple test (if PyTorch available)
-if TORCH_AVAILABLE and gpu_count > 0:
-    print("✓ Step 5: Running quick GPU workload test...")
-    try:
-        # Start the workload
-        workload_manager.start_workload(workload_id)
-        print(f"  ✓ Started workload on GPU 0")
-        
-        # Monitor for a few seconds
-        for i in range(3):
-            time.sleep(1)
-            status = workload_manager.get_workload_status(workload_id)
-            print(f"  ✓ Progress: {status['progress']:.1f}% "
-                  f"({status['iterations_completed']} iterations, "
-                  f"status: {status['status']})")
-        
-        # Stop it
-        workload_manager.stop_workload(workload_id)
-        final_status = workload_manager.get_workload_status(workload_id)
-        print(f"  ✓ Workload stopped: {final_status['status']}")
-        
-    except Exception as e:
-        print(f"  ✗ Workload test failed: {e}")
-        import traceback
-        traceback.print_exc()
-else:
-    print("⊘ Step 5: Skipping workload test (PyTorch/CUDA not available)")
-
-print()
-
-# Check 6: Test integration test creation
-print("✓ Step 6: Testing integration test framework...")
-try:
-    test = create_basic_disconnect_test(gpu_id=0)
-    print(f"  ✓ Created test: {test.name}")
-    print(f"    Description: {test.description}")
-    print(f"    Workload: {test.workload_type.value}")
-    print(f"    Duration: {test.workload_duration}s")
-except Exception as e:
-    print(f"  ✗ Failed to create integration test: {e}")
-    sys.exit(1)
-
-print()
-
-# Summary
-print("="*80)
-print("VALIDATION SUMMARY")
-print("="*80)
-print()
-
-all_checks = [
-    ("Module imports", True),
-    ("PyTorch CUDA", TORCH_AVAILABLE),
-    ("Workload creation", True),
-    ("Disconnect detection", has_disconnect),
-    ("GPU workload execution", TORCH_AVAILABLE and gpu_count > 0),
-    ("Integration test framework", True)
-]
-
-passed = sum(1 for _, result in all_checks if result)
-total = len(all_checks)
-
-for check_name, result in all_checks:
-    symbol = "✓" if result else "⚠" if "PyTorch" in check_name else "✗"
-    status = "PASS" if result else "WARN" if "PyTorch" in check_name else "FAIL"
-    print(f"{symbol} {check_name}: {status}")
-
-print()
-print(f"Results: {passed}/{total} checks passed")
-print()
-
-if not TORCH_AVAILABLE:
-    print("⚠ WARNING: PyTorch CUDA not available")
-    print("  The framework is installed but cannot run GPU workloads")
-    print("  Install PyTorch with CUDA:")
-    print("  pip install torch --index-url https://download.pytorch.org/whl/cu118")
-    print()
-
-if not has_disconnect:
-    print("⚠ WARNING: Disconnect capabilities limited")
-    print("  This is normal if not running as root or in WSL2")
-    print("  For full disconnect testing, run with sudo on bare-metal Linux")
-    print()
-
-# Next steps
-print("="*80)
-print("NEXT STEPS")
-print("="*80)
-print()
-print("1. Start the application:")
-print("   docker-compose up --build")
-print()
-print("2. Test via Web UI:")
-print("   Open http://localhost:1312")
-print("   - Click disconnect button on any GPU")
-print("   - Select method and duration")
-print()
-print("3. Run full integration tests:")
-print("   cd tests")
-print("   sudo python3 test_gpu_disconnect_integration.py")
-print()
-print("4. Test via API:")
-print("   # Create workload")
-print("   curl -X POST http://localhost:1312/api/gpu/workload/create \\")
-print("     -H 'Content-Type: application/json' \\")
-print("     -d '{\"gpu_id\": 0, \"workload_type\": \"compute_intensive\", \"duration\": 30}'")
-print()
-print("   # Start workload (use workload_id from response)")
-print("   curl -X POST http://localhost:1312/api/gpu/workload/<ID>/start")
-print()
-print("   # Trigger disconnect while running")
-print("   curl -X POST http://localhost:1312/api/gpu/0/disconnect \\")
-print("     -H 'Content-Type: application/json' \\")
-print("     -d '{\"method\": \"auto\", \"down_time\": 5}'")
-print()
-print("   # Check workload status (should be interrupted)")
-print("   curl http://localhost:1312/api/gpu/workload/<ID>/status")
-print()
-print("="*80)
-print()
-
-if passed == total:
-    print("✓ ALL SYSTEMS GO! The implementation is ready to use.")
-    sys.exit(0)
-elif passed >= total - 1:
-    print("⚠ MOSTLY READY - Some optional features unavailable")
-    sys.exit(0)
-else:
-    print("✗ ISSUES DETECTED - Please review warnings above")
-    sys.exit(1)
-
diff --git a/tests/README.md b/tests/README.md
deleted file mode 100644
index 2dfa08d..0000000
--- a/tests/README.md
+++ /dev/null
@@ -1,243 +0,0 @@
-# GPU Disconnect Integration Tests
-
-This directory contains comprehensive integration tests for GPU disconnect functionality.
-
-## Quick Start
-
-### Run Full Test Suite
-```bash
-cd tests
-python3 test_gpu_disconnect_integration.py
-```
-
-This will run a complete suite of disconnect tests including:
-- Basic disconnect during compute workload
-- Memory stress test with disconnect
-- Immediate disconnect after workload start  
-- Continuous workload disconnect
-
-## Requirements
-
-### System Requirements
-- **Linux** with PCI sysfs (`/sys/bus/pci/devices`)
-- **Root privileges** (for actual GPU disconnect)
-- **NVIDIA GPU** with drivers installed
-- **PyTorch with CUDA** support
-
-### Python Dependencies
-```bash
-pip install torch --index-url https://download.pytorch.org/whl/cu118
-```
-
-Or use the Docker container which includes all dependencies.
-
-## Test Components
-
-### 1. GPU Workload Generator (`core/gpu_test_workloads.py`)
-Generates various GPU workloads for testing:
-
-**Workload Types:**
-- `MEMORY_STRESS` - Rapid memory allocation/deallocation
-- `COMPUTE_INTENSIVE` - Matrix multiplications and heavy compute
-- `LONG_RUNNING` - Single long operation with many iterations
-- `CONTINUOUS` - Rapid small operations in tight loop
-- `MIXED` - Combination of memory and compute operations
-
-### 2. Integration Test Framework (`test_gpu_disconnect_integration.py`)
-Orchestrates complete test scenarios:
-
-**Test Phases:**
-1. **Start Workload** - Begin GPU operation
-2. **Monitor** - Track workload progress
-3. **Disconnect** - Trigger GPU disconnect
-4. **Validate** - Verify expected behavior
-
-**Expected Results:**
-- Workload interrupted or fails during disconnect
-- CUDA errors captured appropriately
-- GPU unavailable during disconnect period
-- GPU recovers after reconnect
-
-### 3. Pre-configured Test Scenarios
-Ready-to-use test configurations:
-
-```python
-from tests.test_gpu_disconnect_integration import (
-    create_basic_disconnect_test,
-    create_memory_stress_disconnect_test,
-    create_immediate_disconnect_test,
-    create_continuous_workload_test,
-    create_standard_test_suite
-)
-
-# Run single test
-test = create_basic_disconnect_test(gpu_id=0)
-result = await test.run()
-
-# Run full suite
-suite = create_standard_test_suite(gpu_id=0)
-results = await suite.run_all()
-```
-
-## Manual Testing with API
-
-You can also test via the REST API when the application is running:
-
-### 1. Create and Start Workload
-```bash
-# Create workload
-curl -X POST http://localhost:1312/api/gpu/workload/create \
-  -H "Content-Type: application/json" \
-  -d '{"gpu_id": 0, "workload_type": "compute_intensive", "duration": 30.0}'
-
-# Response includes workload_id
-# {"workload_id": "workload_1_1234567890", ...}
-
-# Start the workload
-curl -X POST http://localhost:1312/api/gpu/workload/workload_1_1234567890/start
-```
-
-### 2. Monitor Workload
-```bash
-# Check workload status
-curl http://localhost:1312/api/gpu/workload/workload_1_1234567890/status
-
-# List all workloads
-curl http://localhost:1312/api/gpu/workloads
-```
-
-### 3. Trigger Disconnect During Workload
-```bash
-# While workload is running, trigger disconnect
-curl -X POST http://localhost:1312/api/gpu/0/disconnect \
-  -H "Content-Type: application/json" \
-  -d '{"method": "auto", "down_time": 5.0}'
-```
-
-### 4. Check Results
-```bash
-# Check final workload status
-curl http://localhost:1312/api/gpu/workload/workload_1_1234567890/status
-
-# Expected: status should be "interrupted" or "failed"
-```
-
-## Test Validation Criteria
-
-### Successful Disconnect Test:
-✅ Workload starts successfully  
-✅ Disconnect operation completes  
-✅ Workload is interrupted/fails during disconnect  
-✅ GPU becomes unavailable (nvidia-smi shows error)  
-✅ GPU recovers after reconnect  
-✅ New operations can be scheduled after recovery  
-
-### Expected Behaviors:
-
-**During Disconnect:**
-- Running CUDA operations fail with errors
-- New operations cannot be scheduled
-- `nvidia-smi` reports GPU unavailable
-- Workload status changes to `interrupted` or `failed`
-
-**After Reconnect:**
-- GPU reappears in system
-- New workloads can be created
-- Operations complete successfully
-- No memory leaks or resource issues
-
-## Troubleshooting
-
-### "PyTorch CUDA not available"
-Install PyTorch with CUDA support:
-```bash
-pip install torch --index-url https://download.pytorch.org/whl/cu118
-```
-
-### "Permission denied" during disconnect
-Tests require root privileges for actual GPU disconnect:
-```bash
-sudo python3 test_gpu_disconnect_integration.py
-```
-
-### "Workload completed despite disconnect"
-This indicates the disconnect didn't actually affect the GPU. Possible causes:
-- Insufficient privileges (need root)
-- WSL2 limitations (use bare metal Linux)
-- Disconnect method not supported on platform
-
-### Tests pass but you want to verify manually
-Check system logs during test:
-```bash
-# Terminal 1: Run tests
-sudo python3 test_gpu_disconnect_integration.py
-
-# Terminal 2: Watch GPU status
-watch -n 0.5 nvidia-smi
-
-# Terminal 3: Monitor kernel messages
-sudo dmesg -w | grep -i gpu
-```
-
-## Advanced Usage
-
-### Custom Test Scenario
-```python
-from tests.test_gpu_disconnect_integration import DisconnectTestScenario
-from core.gpu_test_workloads import WorkloadType
-
-# Create custom test
-test = DisconnectTestScenario(
-    test_id="custom_test_1",
-    name="Custom Stress Test",
-    description="My custom disconnect scenario",
-    gpu_id=0,
-    workload_type=WorkloadType.MEMORY_STRESS,
-    workload_duration=60.0,      # 60 second workload
-    disconnect_delay=10.0,        # Disconnect after 10s
-    disconnect_method="logical",  # Force logical method
-    disconnect_duration=15.0      # Keep disconnected for 15s
-)
-
-result = await test.run()
-print(result)
-```
-
-### Multi-GPU Testing
-```python
-# Test on different GPUs
-suite = DisconnectTestSuite("Multi-GPU Tests")
-
-for gpu_id in [0, 1, 2, 3]:
-    test = create_basic_disconnect_test(gpu_id=gpu_id)
-    suite.add_test(test)
-
-results = await suite.run_all()
-```
-
-## CI/CD Integration
-
-For automated testing in CI/CD pipelines:
-
-```bash
-# Run tests with JSON output
-python3 test_gpu_disconnect_integration.py --json > results.json
-
-# Check exit code
-if [ $? -eq 0 ]; then
-    echo "All tests passed"
-else
-    echo "Tests failed"
-    exit 1
-fi
-```
-
-## WSL2 / Limited Environments
-
-In WSL2 or environments without full PCI access, tests will:
-- Execute workloads successfully ✅
-- Attempt disconnect operations ✅
-- Report permission errors (expected) ⚠️
-- Still validate UI/API functionality ✅
-
-This allows partial validation even without hardware disconnect capability.
\ No newline at end of file
diff --git a/tests/test_gpu_disconnect_integration.py b/tests/test_gpu_disconnect_integration.py
deleted file mode 100644
index 1ed1e51..0000000
--- a/tests/test_gpu_disconnect_integration.py
+++ /dev/null
@@ -1,407 +0,0 @@
-#!/usr/bin/env python3
-"""
-GPU Disconnect Integration Tests
-Orchestrates workloads, triggers disconnects, and validates results
-"""
-
-import asyncio
-import logging
-import time
-from datetime import datetime
-from typing import Dict, List, Optional
-from enum import Enum
-
-import sys
-sys.path.insert(0, '../')
-
-from core.gpu_test_workloads import (
-    WorkloadType, WorkloadStatus, workload_manager, TORCH_AVAILABLE
-)
-from core.gpu_disconnect import gpu_disconnector, DisconnectMethod, GPUDisconnectError
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class TestStatus(Enum):
-    """Status of a disconnect test"""
-    PENDING = "pending"
-    RUNNING = "running"
-    PASSED = "passed"
-    FAILED = "failed"
-    ERROR = "error"
-
-
-class DisconnectTestScenario:
-    """Represents a single disconnect test scenario"""
-    
-    def __init__(
-        self,
-        test_id: str,
-        name: str,
-        description: str,
-        gpu_id: int,
-        workload_type: WorkloadType = WorkloadType.COMPUTE_INTENSIVE,
-        workload_duration: float = 15.0,
-        disconnect_delay: float = 3.0,
-        disconnect_method: str = "auto",
-        disconnect_duration: float = 5.0
-    ):
-        self.test_id = test_id
-        self.name = name
-        self.description = description
-        self.gpu_id = gpu_id
-        self.workload_type = workload_type
-        self.workload_duration = workload_duration
-        self.disconnect_delay = disconnect_delay
-        self.disconnect_method = disconnect_method
-        self.disconnect_duration = disconnect_duration
-        
-        self.status = TestStatus.PENDING
-        self.start_time = None
-        self.end_time = None
-        self.workload_id = None
-        self.workload_status_before = None
-        self.workload_status_during = None
-        self.workload_status_after = None
-        self.disconnect_result = None
-        self.errors = []
-        self.logs = []
-    
-    async def run(self) -> Dict:
-        """Execute the test scenario"""
-        self.status = TestStatus.RUNNING
-        self.start_time = datetime.now()
-        self.log(f"Starting test: {self.name}")
-        
-        try:
-            # Phase 1: Start GPU workload
-            self.log(f"Phase 1: Starting {self.workload_type.value} workload on GPU {self.gpu_id}")
-            self.workload_id = workload_manager.create_workload(
-                gpu_id=self.gpu_id,
-                workload_type=self.workload_type,
-                duration=self.workload_duration
-            )
-            workload_manager.start_workload(self.workload_id)
-            
-            # Wait a bit for workload to get going
-            await asyncio.sleep(1.0)
-            self.workload_status_before = workload_manager.get_workload_status(self.workload_id)
-            self.log(f"Workload started: {self.workload_status_before['iterations_completed']} iterations")
-            
-            # Phase 2: Wait before disconnect
-            if self.disconnect_delay > 0:
-                self.log(f"Phase 2: Waiting {self.disconnect_delay}s before disconnect")
-                await asyncio.sleep(self.disconnect_delay)
-                self.workload_status_during = workload_manager.get_workload_status(self.workload_id)
-                self.log(f"Workload progress: {self.workload_status_during['progress']:.1f}% "
-                        f"({self.workload_status_during['iterations_completed']} iterations)")
-            
-            # Phase 3: Trigger disconnect
-            self.log(f"Phase 3: Triggering GPU {self.gpu_id} disconnect using {self.disconnect_method}")
-            disconnect_start = time.time()
-            
-            try:
-                self.disconnect_result = await gpu_disconnector.disconnect_gpu(
-                    gpu_index=self.gpu_id,
-                    method=DisconnectMethod(self.disconnect_method),
-                    down_time=self.disconnect_duration
-                )
-                disconnect_elapsed = time.time() - disconnect_start
-                self.log(f"Disconnect completed in {disconnect_elapsed:.2f}s: {self.disconnect_result.get('message', 'OK')}")
-                
-            except GPUDisconnectError as e:
-                self.log(f"Disconnect operation failed: {e}", level=logging.ERROR)
-                self.errors.append(f"Disconnect failed: {e}")
-                self.disconnect_result = {'success': False, 'error': str(e)}
-            
-            # Phase 4: Check workload status after disconnect
-            await asyncio.sleep(1.0)
-            self.workload_status_after = workload_manager.get_workload_status(self.workload_id)
-            self.log(f"Workload final status: {self.workload_status_after['status']} "
-                    f"({self.workload_status_after['iterations_completed']} iterations)")
-            
-            # Phase 5: Validate results
-            self.log("Phase 5: Validating test results")
-            validation = self.validate_results()
-            
-            if validation['passed']:
-                self.status = TestStatus.PASSED
-                self.log("✓ Test PASSED")
-            else:
-                self.status = TestStatus.FAILED
-                self.log(f"✗ Test FAILED: {validation['reason']}")
-                self.errors.append(validation['reason'])
-            
-        except Exception as e:
-            self.status = TestStatus.ERROR
-            self.log(f"Test ERROR: {e}", level=logging.ERROR)
-            self.errors.append(str(e))
-            
-        finally:
-            self.end_time = datetime.now()
-            # Clean up workload
-            if self.workload_id:
-                try:
-                    workload_manager.stop_workload(self.workload_id)
-                except:
-                    pass
-        
-        return self.get_result()
-    
-    def validate_results(self) -> Dict:
-        """Validate that the test behaved as expected"""
-        # Expected behavior: workload should be interrupted or fail during disconnect
-        
-        if not self.workload_status_after:
-            return {'passed': False, 'reason': 'No workload status available after disconnect'}
-        
-        # Check if disconnect succeeded
-        if not self.disconnect_result or not self.disconnect_result.get('success'):
-            # If disconnect failed, test is inconclusive but not necessarily failed
-            # (might be testing in an environment without proper permissions)
-            return {
-                'passed': True,  # Pass but note the limitation
-                'reason': 'Disconnect operation failed (expected in limited environments)',
-                'note': 'Could not validate actual GPU disconnect behavior'
-            }
-        
-        # If disconnect succeeded, workload should be interrupted or failed
-        workload_final_status = self.workload_status_after['status']
-        
-        # Expected: workload interrupted, failed, or didn't complete all iterations
-        if workload_final_status in ['interrupted', 'failed']:
-            return {
-                'passed': True,
-                'reason': f'Workload correctly {workload_final_status} during disconnect'
-            }
-        
-        # Check if workload completed but didn't finish all expected iterations
-        if workload_final_status == 'completed':
-            completed = self.workload_status_after['iterations_completed']
-            expected = self.workload_status_after.get('expected_iterations', 100)
-            
-            if completed < expected:
-                return {
-                    'passed': True,
-                    'reason': f'Workload interrupted early ({completed}/{expected} iterations)'
-                }
-            else:
-                return {
-                    'passed': False,
-                    'reason': 'Workload completed all iterations despite disconnect (disconnect may not have affected GPU)'
-                }
-        
-        return {
-            'passed': True,
-            'reason': 'Test completed with expected behavior'
-        }
-    
-    def log(self, message: str, level=logging.INFO):
-        """Log a message"""
-        timestamp = datetime.now().isoformat()
-        log_entry = f"[{timestamp}] {message}"
-        self.logs.append(log_entry)
-        logger.log(level, f"[{self.test_id}] {message}")
-    
-    def get_result(self) -> Dict:
-        """Get test results"""
-        duration = None
-        if self.start_time and self.end_time:
-            duration = (self.end_time - self.start_time).total_seconds()
-        
-        return {
-            'test_id': self.test_id,
-            'name': self.name,
-            'description': self.description,
-            'status': self.status.value,
-            'duration_seconds': duration,
-            'gpu_id': self.gpu_id,
-            'workload_type': self.workload_type.value,
-            'disconnect_method': self.disconnect_method,
-            'workload_before': self.workload_status_before,
-            'workload_during': self.workload_status_during,
-            'workload_after': self.workload_status_after,
-            'disconnect_result': self.disconnect_result,
-            'errors': self.errors,
-            'logs': self.logs,
-            'start_time': self.start_time.isoformat() if self.start_time else None,
-            'end_time': self.end_time.isoformat() if self.end_time else None
-        }
-
-
-class DisconnectTestSuite:
-    """Collection of test scenarios"""
-    
-    def __init__(self, suite_name: str):
-        self.suite_name = suite_name
-        self.tests: List[DisconnectTestScenario] = []
-        self.start_time = None
-        self.end_time = None
-    
-    def add_test(self, test: DisconnectTestScenario):
-        """Add a test to the suite"""
-        self.tests.append(test)
-    
-    async def run_all(self) -> Dict:
-        """Run all tests in the suite"""
-        self.start_time = datetime.now()
-        logger.info(f"Starting test suite: {self.suite_name} ({len(self.tests)} tests)")
-        
-        results = []
-        passed = 0
-        failed = 0
-        errors = 0
-        
-        for test in self.tests:
-            logger.info(f"Running test {len(results) + 1}/{len(self.tests)}: {test.name}")
-            result = await test.run()
-            results.append(result)
-            
-            if result['status'] == 'passed':
-                passed += 1
-            elif result['status'] == 'failed':
-                failed += 1
-            elif result['status'] == 'error':
-                errors += 1
-            
-            # Brief pause between tests
-            await asyncio.sleep(2.0)
-        
-        self.end_time = datetime.now()
-        duration = (self.end_time - self.start_time).total_seconds()
-        
-        summary = {
-            'suite_name': self.suite_name,
-            'total_tests': len(self.tests),
-            'passed': passed,
-            'failed': failed,
-            'errors': errors,
-            'duration_seconds': duration,
-            'tests': results,
-            'start_time': self.start_time.isoformat(),
-            'end_time': self.end_time.isoformat()
-        }
-        
-        logger.info(f"Test suite completed: {passed} passed, {failed} failed, {errors} errors")
-        return summary
-
-
-# Pre-configured test scenarios
-
-def create_basic_disconnect_test(gpu_id: int = 0) -> DisconnectTestScenario:
-    """Basic disconnect test - compute workload + disconnect"""
-    return DisconnectTestScenario(
-        test_id=f"basic_disconnect_gpu{gpu_id}_{int(time.time())}",
-        name="Basic Disconnect Test",
-        description="Start compute workload, wait, then disconnect GPU",
-        gpu_id=gpu_id,
-        workload_type=WorkloadType.COMPUTE_INTENSIVE,
-        workload_duration=15.0,
-        disconnect_delay=3.0,
-        disconnect_method="auto",
-        disconnect_duration=5.0
-    )
-
-
-def create_memory_stress_disconnect_test(gpu_id: int = 0) -> DisconnectTestScenario:
-    """Memory stress disconnect test"""
-    return DisconnectTestScenario(
-        test_id=f"memory_disconnect_gpu{gpu_id}_{int(time.time())}",
-        name="Memory Stress Disconnect Test",
-        description="Memory allocation stress test during disconnect",
-        gpu_id=gpu_id,
-        workload_type=WorkloadType.MEMORY_STRESS,
-        workload_duration=20.0,
-        disconnect_delay=4.0,
-        disconnect_method="auto",
-        disconnect_duration=5.0
-    )
-
-
-def create_immediate_disconnect_test(gpu_id: int = 0) -> DisconnectTestScenario:
-    """Immediate disconnect test - disconnect right after workload starts"""
-    return DisconnectTestScenario(
-        test_id=f"immediate_disconnect_gpu{gpu_id}_{int(time.time())}",
-        name="Immediate Disconnect Test",
-        description="Disconnect GPU immediately after workload starts",
-        gpu_id=gpu_id,
-        workload_type=WorkloadType.LONG_RUNNING,
-        workload_duration=30.0,
-        disconnect_delay=1.0,
-        disconnect_method="logical",
-        disconnect_duration=3.0
-    )
-
-
-def create_continuous_workload_test(gpu_id: int = 0) -> DisconnectTestScenario:
-    """Continuous workload disconnect test"""
-    return DisconnectTestScenario(
-        test_id=f"continuous_disconnect_gpu{gpu_id}_{int(time.time())}",
-        name="Continuous Workload Disconnect",
-        description="Continuous rapid operations during disconnect",
-        gpu_id=gpu_id,
-        workload_type=WorkloadType.CONTINUOUS,
-        workload_duration=25.0,
-        disconnect_delay=5.0,
-        disconnect_method="auto",
-        disconnect_duration=7.0
-    )
-
-
-def create_standard_test_suite(gpu_id: int = 0) -> DisconnectTestSuite:
-    """Create standard test suite with common scenarios"""
-    suite = DisconnectTestSuite(f"Standard Disconnect Tests (GPU {gpu_id})")
-    
-    suite.add_test(create_basic_disconnect_test(gpu_id))
-    suite.add_test(create_memory_stress_disconnect_test(gpu_id))
-    suite.add_test(create_immediate_disconnect_test(gpu_id))
-    suite.add_test(create_continuous_workload_test(gpu_id))
-    
-    return suite
-
-
-# Main test execution
-async def main():
-    """Run test suite"""
-    if not TORCH_AVAILABLE:
-        logger.error("PyTorch with CUDA not available - cannot run GPU tests")
-        logger.info("Install PyTorch with CUDA support: pip install torch --index-url https://download.pytorch.org/whl/cu118")
-        return
-    
-    import torch
-    gpu_count = torch.cuda.device_count()
-    logger.info(f"Found {gpu_count} GPU(s) available for testing")
-    
-    if gpu_count == 0:
-        logger.error("No GPUs available for testing")
-        return
-    
-    # Run standard test suite on GPU 0
-    suite = create_standard_test_suite(gpu_id=0)
-    results = await suite.run_all()
-    
-    # Print summary
-    print("\n" + "="*80)
-    print(f"Test Suite: {results['suite_name']}")
-    print("="*80)
-    print(f"Total Tests: {results['total_tests']}")
-    print(f"Passed: {results['passed']}")
-    print(f"Failed: {results['failed']}")
-    print(f"Errors: {results['errors']}")
-    print(f"Duration: {results['duration_seconds']:.2f}s")
-    print("="*80)
-    
-    # Print individual test results
-    for test in results['tests']:
-        status_symbol = "✓" if test['status'] == 'passed' else "✗"
-        print(f"{status_symbol} {test['name']}: {test['status'].upper()}")
-        if test['errors']:
-            for error in test['errors']:
-                print(f"  Error: {error}")
-    
-    print("="*80)
-
-
-if __name__ == "__main__":
-    asyncio.run(main())

From d019e0f78ab8156c5a45615f1a220ada5f57ef9b Mon Sep 17 00:00:00 2001
From: SpyrosMouselinos <mouselinos.spur.kw@gmail.com>
Date: Thu, 23 Oct 2025 14:24:42 +0200
Subject: [PATCH 4/5] Debloat

---
 core/gpu_disconnect.py | 170 ++++++++++++++++++++---------------------
 core/handlers.py       |   2 +-
 requirements.txt       |   3 +-
 3 files changed, 86 insertions(+), 89 deletions(-)

diff --git a/core/gpu_disconnect.py b/core/gpu_disconnect.py
index e5084b6..1a660ee 100644
--- a/core/gpu_disconnect.py
+++ b/core/gpu_disconnect.py
@@ -449,75 +449,40 @@ async def _hot_reset_disconnect(self, bdf: str, down_time: float):
 
     async def _logical_disconnect(self, bdf: str, down_time: float):
         """Execute logical disconnect (remove/rescan)"""
-        logger.info(f"[DISCONNECT START] GPU {bdf} - target down_time: {down_time}s")
+        logger.info(f"Executing logical disconnect for {bdf}")
         
         device_path = SYSFS_PCI_DEVICES / bdf
         
-        # Log state before removal
-        try:
-            nvml_count_pre = pynvml.nvmlDeviceGetCount()
-        except Exception as e:
-            nvml_count_pre = f"Error: {e}"
-        
-        logger.info(f"[PRE-REMOVE] Device path exists: {device_path.exists()}")
-        logger.info(f"[PRE-REMOVE] NVML device count: {nvml_count_pre}")
-        
         # Unbind and remove
         await self._unbind_driver(bdf)
-        logger.info(f"[REMOVE] Writing '1' to {device_path / 'remove'}")
         await self._write_sysfs(device_path / "remove", "1")
         
-        # Wait briefly for removal to take effect, then verify
+        # Wait briefly for removal to take effect
         await asyncio.sleep(0.5)
         
-        try:
-            nvml_count_post = pynvml.nvmlDeviceGetCount()
-        except Exception as e:
-            nvml_count_post = f"Error: {e}"
-        
-        logger.info(f"[POST-REMOVE] Device path exists: {device_path.exists()}")
-        logger.info(f"[POST-REMOVE] NVML device count: {nvml_count_post}")
-        
         if device_path.exists():
-            logger.warning(f"[POST-REMOVE] WARNING: Device {bdf} still exists after removal!")
-        else:
-            logger.info(f"[POST-REMOVE] Confirmed: Device {bdf} successfully removed from PCI bus")
+            logger.warning(f"Device {bdf} still exists after removal - may not be properly disconnected")
         
         # Sleep for down_time
-        sleep_start = time.time()
-        logger.info(f"[SLEEP START] Sleeping for {down_time}s to simulate disconnect")
         await asyncio.sleep(down_time)
-        sleep_duration = time.time() - sleep_start
-        logger.info(f"[SLEEP END] Actual sleep duration: {sleep_duration:.2f}s")
         
         # Rescan PCI bus
-        logger.info(f"[RESCAN] Triggering PCI bus rescan")
         await self._write_sysfs(SYSFS_PCI_RESCAN, "1")
         
         # Wait for device to reappear
-        logger.info(f"[RESCAN] Waiting for {bdf} to reappear (timeout: 30s)")
         await self._wait_for_condition(
             lambda: (SYSFS_PCI_DEVICES / bdf).exists(),
             timeout=30,
             description=f"{bdf} to reappear"
         )
-        
-        # Verify reconnection
-        try:
-            nvml_count_final = pynvml.nvmlDeviceGetCount()
-        except Exception as e:
-            nvml_count_final = f"Error: {e}"
-        
-        logger.info(f"[POST-RESCAN] Device path exists: {device_path.exists()}")
-        logger.info(f"[POST-RESCAN] NVML device count: {nvml_count_final}")
-        logger.info(f"[DISCONNECT END] GPU {bdf} reconnected successfully")
 
     async def _nvidia_reset_disconnect(self, bdf: str, down_time: float, gpu_index: int = None):
         """Execute NVIDIA GPU reset using nvidia-smi"""
-        logger.info(f"[NVIDIA-RESET] Resetting GPU {gpu_index if gpu_index is not None else 'unknown'} ({bdf})")
+        # Find GPU index from BDF if not provided
+        if gpu_index is None:
+            gpu_index = await self._get_gpu_index_from_bdf(bdf)
         
-        # Find GPU index from BDF
-        gpu_index = await self._get_gpu_index_from_bdf(bdf)
+        logger.info(f"Executing NVIDIA reset for GPU {gpu_index}")
         
         result = await asyncio.create_subprocess_exec(
             'nvidia-smi', '--gpu-reset', '-i', str(gpu_index),
@@ -589,7 +554,6 @@ async def _unbind_driver(self, bdf: str):
                 unbind_file = Path(f"/sys/bus/pci/drivers/{driver_name}/unbind")
                 if unbind_file.exists():
                     await self._write_sysfs(unbind_file, bdf)
-                    logger.debug(f"Unbound driver {driver_name} from {bdf}")
         except Exception as e:
             logger.warning(f"Failed to unbind driver for {bdf}: {e}")
 
@@ -600,7 +564,6 @@ def write_sync():
                 path.write_text(value)
             
             await asyncio.get_event_loop().run_in_executor(None, write_sync)
-            logger.debug(f"Wrote '{value}' to {path}")
             
         except Exception as e:
             raise GPUDisconnectError(f"Failed to write to {path}: {e}")
@@ -617,76 +580,111 @@ async def _wait_for_condition(self, condition, timeout: int, description: str):
     
     async def _simulated_disconnect(self, gpu_index: int, down_time: float):
         """Simulate disconnect in software only - WSL2 safe"""
-        logger.info(f"[SIMULATED] Marking GPU {gpu_index} as offline for {down_time}s")
-        logger.info(f"[SIMULATED] This is a software-only simulation - GPU remains physically available")
+        logger.info(f"Simulating disconnect for GPU {gpu_index} ({down_time}s)")
         
         # Add to simulated offline set
         _simulated_offline_gpus.add(gpu_index)
         
         try:
-            logger.info(f"[SIMULATED] GPU {gpu_index} now appears 'disconnected' to monitor")
             await asyncio.sleep(down_time)
         finally:
             # Remove from offline set
             if gpu_index in _simulated_offline_gpus:
                 _simulated_offline_gpus.remove(gpu_index)
-            logger.info(f"[SIMULATED] GPU {gpu_index} back online - disconnect simulation complete")
     
     async def _memory_flood_disconnect(self, gpu_index: int, down_time: float):
         """Flood GPU memory to trigger potential OOM/driver reset - EXPERIMENTAL"""
-        logger.warning(f"[MEMORY-FLOOD] Starting EXPERIMENTAL memory flood on GPU {gpu_index}")
-        logger.warning(f"[MEMORY-FLOOD] This may cause unpredictable behavior or system instability!")
+        logger.warning(f"Starting EXPERIMENTAL memory flood on GPU {gpu_index} - may cause instability!")
         
-        try:
-            import torch
-        except ImportError:
-            raise GPUDisconnectError("PyTorch not available - memory flood requires torch")
+        import ctypes
+        
+        allocations = []
+        ctx = None
         
         try:
-            torch.cuda.set_device(gpu_index)
-            total_mem = torch.cuda.get_device_properties(gpu_index).total_memory
-            logger.info(f"[MEMORY-FLOOD] GPU {gpu_index} total memory: {total_mem / 1e9:.2f}GB")
+            # Load CUDA driver library
+            try:
+                libcuda = ctypes.CDLL('libcuda.so.1')
+            except OSError as e:
+                raise GPUDisconnectError(f"CUDA driver library not found: {e}")
             
-            allocations = []
+            # Define CUDA function signatures
+            cuInit = libcuda.cuInit
+            cuInit.argtypes = [ctypes.c_uint]
+            cuInit.restype = ctypes.c_int
+            
+            cuDeviceGet = libcuda.cuDeviceGet
+            cuDeviceGet.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_int]
+            cuDeviceGet.restype = ctypes.c_int
+            
+            cuCtxCreate = libcuda.cuCtxCreate_v2
+            cuCtxCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_uint, ctypes.c_int]
+            cuCtxCreate.restype = ctypes.c_int
+            
+            cuCtxDestroy = libcuda.cuCtxDestroy_v2
+            cuCtxDestroy.argtypes = [ctypes.c_void_p]
+            cuCtxDestroy.restype = ctypes.c_int
+            
+            cuMemAlloc = libcuda.cuMemAlloc_v2
+            cuMemAlloc.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]
+            cuMemAlloc.restype = ctypes.c_int
+            
+            cuMemFree = libcuda.cuMemFree_v2
+            cuMemFree.argtypes = [ctypes.c_void_p]
+            cuMemFree.restype = ctypes.c_int
+            
+            # Initialize CUDA and create context
+            if cuInit(0) != 0:
+                raise GPUDisconnectError(f"CUDA initialization failed")
+            
+            device = ctypes.c_int()
+            if cuDeviceGet(ctypes.byref(device), gpu_index) != 0:
+                raise GPUDisconnectError(f"Failed to get CUDA device {gpu_index}")
+            
+            ctx = ctypes.c_void_p()
+            if cuCtxCreate(ctypes.byref(ctx), 0, device) != 0:
+                raise GPUDisconnectError(f"Failed to create CUDA context for GPU {gpu_index}")
+            
+            # Get GPU memory info
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+            free_mem = mem_info.free
+            
+            # Allocate memory chunks
             allocated_bytes = 0
             chunk_size = 100 * 1024 * 1024  # 100MB chunks
+            target_bytes = int(free_mem * 0.95)
             
-            # Phase 1: Allocate until OOM
-            logger.info(f"[MEMORY-FLOOD] Phase 1: Allocating memory until OOM...")
-            try:
-                while allocated_bytes < total_mem * 0.95:  # Don't try to allocate 100%
-                    tensor = torch.empty(chunk_size // 4, dtype=torch.float32, device=f'cuda:{gpu_index}')
-                    allocations.append(tensor)
+            while allocated_bytes < target_bytes:
+                ptr = ctypes.c_void_p()
+                result = cuMemAlloc(ctypes.byref(ptr), chunk_size)
+                
+                if result == 0:
+                    allocations.append(ptr)
                     allocated_bytes += chunk_size
-                    
-                    if len(allocations) % 10 == 0:
-                        logger.debug(f"[MEMORY-FLOOD] Allocated {allocated_bytes / 1e9:.2f}GB")
-                        
-            except RuntimeError as e:
-                if "out of memory" in str(e).lower():
-                    logger.info(f"[MEMORY-FLOOD] OOM reached at {allocated_bytes / 1e9:.2f}GB: {e}")
                 else:
-                    raise
-            
-            # Phase 2: Hold memory for down_time
-            logger.info(f"[MEMORY-FLOOD] Phase 2: Holding {allocated_bytes / 1e9:.2f}GB for {down_time}s")
-            logger.info(f"[MEMORY-FLOOD] GPU {gpu_index} should be unresponsive during this time")
+                    break
             
+            logger.info(f"Allocated {allocated_bytes / 1e9:.2f}GB on GPU {gpu_index}, holding for {down_time}s")
             await asyncio.sleep(down_time)
             
         except Exception as e:
-            logger.error(f"[MEMORY-FLOOD] Error during memory flood: {e}")
+            logger.error(f"Memory flood error: {e}")
             raise
         finally:
-            # Phase 3: Release memory
-            logger.info(f"[MEMORY-FLOOD] Phase 3: Releasing memory...")
-            allocations.clear()
-            
-            if 'torch' in dir():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize(gpu_index)
-            
-            logger.info(f"[MEMORY-FLOOD] Memory flood complete - GPU {gpu_index} should recover")
+            # Release memory
+            for ptr in allocations:
+                try:
+                    cuMemFree(ptr)
+                except Exception:
+                    pass
+            
+            # Destroy CUDA context
+            if ctx and ctx.value:
+                try:
+                    cuCtxDestroy(ctx)
+                except Exception:
+                    pass
 
 
 # Global instance
diff --git a/core/handlers.py b/core/handlers.py
index 3ea26ab..997d137 100644
--- a/core/handlers.py
+++ b/core/handlers.py
@@ -230,7 +230,7 @@ async def get_disconnect_status():
                     "pci_disconnect": not in_wsl2 and sysfs_accessible,
                     "nvidia_reset": has_nvidia_smi,
                     "simulated": True,
-                    "memory_flood": has_nvidia_smi  # Needs torch/CUDA
+                    "memory_flood": True  # Uses ctypes + CUDA Driver API (zero dependencies)
                 },
                 "warnings": [w for w in warnings if w]
             }
diff --git a/requirements.txt b/requirements.txt
index 6699dc2..a7b7cc4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,5 +5,4 @@ psutil==5.9.6
 nvidia-ml-py==13.580.82
 requests==2.31.0
 websocket-client==1.6.3
-aiohttp==3.9.1
-torch==2.1.0
\ No newline at end of file
+aiohttp==3.9.1
\ No newline at end of file

From 62c9846d10de0e169250608f34779e4d4794c876 Mon Sep 17 00:00:00 2001
From: SpyrosMouselinos <mouselinos.spur.kw@gmail.com>
Date: Thu, 23 Oct 2025 15:56:33 +0200
Subject: [PATCH 5/5] Cosmetic Changes

---
 core/gpu_disconnect.py             | 48 +++++++++------
 static/css/disconnect-controls.css | 79 ++++++++++++++++++++-----
 static/js/gpu-cards.js             | 18 ------
 static/js/gpu-disconnect.js        | 94 +++++++++++++++---------------
 static/js/socket-handlers.js       | 12 ++++
 static/js/ui.js                    |  6 ++
 6 files changed, 157 insertions(+), 100 deletions(-)

diff --git a/core/gpu_disconnect.py b/core/gpu_disconnect.py
index 1a660ee..a3f0204 100644
--- a/core/gpu_disconnect.py
+++ b/core/gpu_disconnect.py
@@ -173,23 +173,33 @@ async def get_available_methods(self, gpu_index: int) -> List[str]:
         try:
             bdf = await self._get_gpu_bdf(gpu_index)
             
-            # Check slot power
-            if self._has_slot_power(bdf):
-                methods.append(DisconnectMethod.SLOT_POWER.value)
-            
-            # Check hot reset capability
-            if self._has_hot_reset_capability(bdf):
-                methods.append(DisconnectMethod.HOT_RESET.value)
-            
-            # Logical remove always available
-            methods.append(DisconnectMethod.LOGICAL.value)
-            
-            # NVIDIA reset (if nvidia-smi available)
-            if await self._has_nvidia_smi():
-                methods.append(DisconnectMethod.NVIDIA_RESET.value)
+            # In WSL2, only memory flood works (experimental)
+            if is_wsl2():
+                methods.append(DisconnectMethod.MEMORY_FLOOD.value)
+                logger.info("WSL2 detected - Only MEMORY_FLOOD available (experimental)")
+            else:
+                # Check slot power (Linux only)
+                if self._has_slot_power(bdf):
+                    methods.append(DisconnectMethod.SLOT_POWER.value)
+                
+                # Check hot reset capability (Linux only)
+                if self._has_hot_reset_capability(bdf):
+                    methods.append(DisconnectMethod.HOT_RESET.value)
+                
+                # Logical remove (Linux only)
+                methods.append(DisconnectMethod.LOGICAL.value)
+                
+                # NVIDIA reset (if nvidia-smi available)
+                if await self._has_nvidia_smi():
+                    methods.append(DisconnectMethod.NVIDIA_RESET.value)
+                
+                # Memory flood experimental method
+                methods.append(DisconnectMethod.MEMORY_FLOOD.value)
                 
         except Exception as e:
             logger.error(f"Error checking methods for GPU {gpu_index}: {e}")
+            # Fallback to memory flood if error
+            methods.append(DisconnectMethod.MEMORY_FLOOD.value)
         
         return methods
 
@@ -286,16 +296,16 @@ async def _execute_disconnect(self, bdf: str, method: DisconnectMethod, down_tim
     async def _select_best_method(self, bdf: str, gpu_index: int = None) -> DisconnectMethod:
         """Select the best available method based on environment"""
         
-        # WSL2 detection - use soft methods
+        # WSL2 detection - use memory flood (experimental)
         if is_wsl2():
-            logger.info("WSL2 detected - using SIMULATED disconnect (PCI methods unavailable)")
-            return DisconnectMethod.SIMULATED
+            logger.info("WSL2 detected - using MEMORY_FLOOD disconnect (experimental)")
+            return DisconnectMethod.MEMORY_FLOOD
         
         # Native Linux - check PCI capabilities
         device_path = SYSFS_PCI_DEVICES / bdf
         if not device_path.exists():
-            logger.warning(f"PCI device {bdf} not accessible - falling back to SIMULATED")
-            return DisconnectMethod.SIMULATED
+            logger.warning(f"PCI device {bdf} not accessible - falling back to MEMORY_FLOOD")
+            return DisconnectMethod.MEMORY_FLOOD
         
         # Use real PCI methods in order of preference
         if self._has_slot_power(bdf):
diff --git a/static/css/disconnect-controls.css b/static/css/disconnect-controls.css
index ab2b5ca..8584a80 100644
--- a/static/css/disconnect-controls.css
+++ b/static/css/disconnect-controls.css
@@ -49,31 +49,62 @@
     border-top: 1px solid rgba(255, 255, 255, 0.1);
 }
 
-/* GPU Selection Checkbox */
-.gpu-select-container {
+/* GPU Disconnect Button (styled like ONLINE badge) */
+.gpu-disconnect-container {
     z-index: 10;
 }
 
-.gpu-select-container label {
+.gpu-disconnect-button {
+    padding: 0.75rem 1.5rem;
+    background: rgba(255, 107, 107, 0.15);
+    border: 2px solid rgba(255, 107, 107, 0.4);
+    border-radius: 30px;
+    font-size: 0.85rem;
+    font-weight: 700;
+    color: #ff6b6b;
     display: flex;
     align-items: center;
-    gap: 4px;
-    font-size: 0.85rem;
-    color: rgba(255, 255, 255, 0.8);
+    gap: 0.5rem;
+    letter-spacing: 1px;
+    box-shadow: 0 0 20px rgba(255, 107, 107, 0.3);
     cursor: pointer;
-    padding: 4px 8px;
-    border-radius: 4px;
-    background: rgba(0, 0, 0, 0.3);
-    transition: background 0.2s ease;
+    transition: all 0.3s ease;
+    text-transform: uppercase;
 }
 
-.gpu-select-container label:hover {
-    background: rgba(0, 0, 0, 0.5);
+.gpu-disconnect-button:hover {
+    background: rgba(255, 107, 107, 0.25);
+    border-color: rgba(255, 107, 107, 0.6);
+    box-shadow: 0 0 30px rgba(255, 107, 107, 0.5);
+    transform: translateY(-2px);
 }
 
-.gpu-select-checkbox {
-    margin: 0;
-    transform: scale(1.1);
+.gpu-disconnect-button:active {
+    transform: translateY(0);
+    box-shadow: 0 0 15px rgba(255, 107, 107, 0.4);
+}
+
+.disconnect-dot {
+    width: 8px;
+    height: 8px;
+    background: #ff6b6b;
+    border-radius: 50%;
+    display: inline-block;
+    box-shadow: 0 0 10px rgba(255, 107, 107, 0.8);
+    animation: pulse-disconnect 2s ease-in-out infinite;
+}
+
+@keyframes pulse-disconnect {
+    0%, 100% {
+        box-shadow: 0 0 10px rgba(255, 107, 107, 0.8);
+    }
+    50% {
+        box-shadow: 0 0 20px rgba(255, 107, 107, 1);
+    }
+}
+
+.disconnect-text {
+    text-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
 }
 
 /* Multi-Select Toolbar */
@@ -250,6 +281,12 @@
     margin-bottom: 8px;
 }
 
+.method-selection select option {
+    background: #2a2a2a;
+    color: white;
+    padding: 8px;
+}
+
 .method-selection select:focus {
     outline: none;
     border-color: #4fc3f7;
@@ -598,6 +635,18 @@
     .notification {
         max-width: none;
     }
+    
+    /* Adjust disconnect button for mobile */
+    .gpu-disconnect-button {
+        padding: 0.5rem 1rem;
+        font-size: 0.75rem;
+        gap: 0.35rem;
+    }
+    
+    .gpu-disconnect-container {
+        right: 10px !important;
+        top: 60px !important;
+    }
 }
 
 /* Dark mode adjustments */
diff --git a/static/js/gpu-cards.js b/static/js/gpu-cards.js
index 1b1b24d..514de15 100644
--- a/static/js/gpu-cards.js
+++ b/static/js/gpu-cards.js
@@ -10,12 +10,6 @@ function createOverviewCard(gpuId, gpuInfo) {
 
     return `
         <div class="overview-gpu-card" data-gpu-id="${gpuId}" onclick="switchToView('gpu-${gpuId}')" style="pointer-events: auto; position: relative;">
-            <div class="gpu-select-container" style="position: absolute; top: 10px; right: 10px; z-index: 10;">
-                <label onclick="event.stopPropagation();">
-                    <input type="checkbox" class="gpu-select-checkbox" data-gpu-id="${gpuId}">
-                    Select
-                </label>
-            </div>
             <div class="overview-header">
                 <div>
                     <h2 style="font-size: 1.5rem; font-weight: 700; background: var(--primary-gradient); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; margin-bottom: 0.25rem;">
@@ -638,18 +632,6 @@ function createGPUCard(gpuId, gpuInfo) {
                 </div>` : ''}
             </div>
             
-            <!-- GPU Actions Section -->
-            <div class="gpu-actions">
-                <div class="gpu-select-container">
-                    <label>
-                        <input type="checkbox" class="gpu-select-checkbox" data-gpu-id="${gpuId}">
-                        Select
-                    </label>
-                </div>
-                <button class="disconnect-button" onclick="showDisconnectModal(${gpuId})">
-                    <span class="disconnect-icon">⚡</span> Disconnect
-                </button>
-            </div>
         </div>
     `;
 }
diff --git a/static/js/gpu-disconnect.js b/static/js/gpu-disconnect.js
index 5ca469a..d379d27 100644
--- a/static/js/gpu-disconnect.js
+++ b/static/js/gpu-disconnect.js
@@ -101,12 +101,7 @@ function setupDisconnectEventListeners() {
         }
     });
     
-    // Listen for multi-select changes
-    document.addEventListener('change', (e) => {
-        if (e.target.classList.contains('gpu-select-checkbox')) {
-            handleGPUSelection(e);
-        }
-    });
+    // Multi-select functionality removed - using individual disconnect buttons now
 }
 
 /**
@@ -142,38 +137,50 @@ function addDisconnectButton(gpuId, gpuCard, nodeInfo = null) {
  * Add multi-select checkbox to GPU card
  */
 function addGPUSelectCheckbox(gpuId, gpuCard, nodeInfo = null) {
-    // Check if checkbox already exists
-    if (gpuCard.querySelector('.gpu-select-checkbox')) {
+    // Check if disconnect button already exists
+    if (gpuCard.querySelector('.gpu-disconnect-button')) {
         return;
     }
     
-    // Create checkbox container
-    const checkboxContainer = document.createElement('div');
-    checkboxContainer.className = 'gpu-select-container';
+    // Create disconnect button container
+    const disconnectContainer = document.createElement('div');
+    disconnectContainer.className = 'gpu-disconnect-container';
     
-    const checkbox = document.createElement('input');
-    checkbox.type = 'checkbox';
-    checkbox.className = 'gpu-select-checkbox';
-    checkbox.dataset.gpuId = gpuId;
+    // Create pill-shaped disconnect button
+    const disconnectButton = document.createElement('button');
+    disconnectButton.className = 'gpu-disconnect-button';
+    disconnectButton.dataset.gpuId = gpuId;
     if (nodeInfo) {
-        checkbox.dataset.nodeName = nodeInfo.node_name;
+        disconnectButton.dataset.nodeName = nodeInfo.node_name;
     }
     
-    const label = document.createElement('label');
-    label.appendChild(checkbox);
-    label.appendChild(document.createTextNode(' Select'));
+    // Add icon and text
+    const iconSpan = document.createElement('span');
+    iconSpan.className = 'disconnect-dot';
+    disconnectButton.appendChild(iconSpan);
     
-    checkboxContainer.appendChild(label);
+    const textSpan = document.createElement('span');
+    textSpan.className = 'disconnect-text';
+    textSpan.textContent = 'Simulate Disconnect';
+    disconnectButton.appendChild(textSpan);
     
-    // Add to GPU card header
-    const header = gpuCard.querySelector('.gpu-header') || gpuCard.querySelector('h3');
-    if (header) {
-        header.style.position = 'relative';
-        checkboxContainer.style.position = 'absolute';
-        checkboxContainer.style.right = '10px';
-        checkboxContainer.style.top = '10px';
-        header.appendChild(checkboxContainer);
-    }
+    // Add click handler
+    disconnectButton.addEventListener('click', (e) => {
+        e.stopPropagation(); // Prevent card click
+        showDisconnectModal(gpuId, nodeInfo);
+    });
+    
+    disconnectContainer.appendChild(disconnectButton);
+    
+    // Position at top-right of the GPU card, aligned with ONLINE badge
+    disconnectContainer.style.position = 'absolute';
+    disconnectContainer.style.right = '200px';
+    disconnectContainer.style.top = '35px';
+    disconnectContainer.style.zIndex = '10';
+    
+    // Add to GPU card (not header, so it's positioned relative to the card)
+    gpuCard.style.position = 'relative';
+    gpuCard.appendChild(disconnectContainer);
 }
 
 /**
@@ -238,14 +245,6 @@ function createDisconnectModal(gpuId, methods, nodeInfo) {
                 </div>
                 
                 <div class="modal-content">
-                    <div class="disconnect-warning">
-                        <div class="warning-icon">⚠️</div>
-                        <div class="warning-text">
-                            <strong>Caution:</strong> This will temporarily disconnect the GPU, interrupting any running processes.
-                            The GPU will automatically reconnect after the specified time.
-                        </div>
-                    </div>
-                    
                     <div class="method-selection">
                         <label>Disconnect Method:</label>
                         <select id="disconnect-method-select">
@@ -747,9 +746,6 @@ function handleGPUSelection(event) {
  */
 function clearGPUSelection() {
     disconnectState.selectedGpus.clear();
-    document.querySelectorAll('.gpu-select-checkbox').forEach(cb => {
-        cb.checked = false;
-    });
     updateMultiSelectUI();
 }
 
@@ -895,10 +891,11 @@ function updateDisconnectButtonState(button, gpuId) {
 function formatMethodName(method) {
     const names = {
         'auto': 'Auto (Best Available)',
-        'slot': 'Slot Power Toggle',
-        'hot': 'Hot Reset',
-        'logical': 'Logical Remove/Rescan',
-        'nvidia': 'NVIDIA GPU Reset'
+        'slot': 'Slot Power Toggle (Linux)',
+        'hot': 'Hot Reset (Linux)',
+        'logical': 'Logical Remove/Rescan (Linux)',
+        'nvidia': 'NVIDIA GPU Reset (Linux)',
+        'memory_flood': 'Memory Flood ⚠️ EXPERIMENTAL (WSL2/Docker/Linux)'
     };
     return names[method] || method.charAt(0).toUpperCase() + method.slice(1);
 }
@@ -909,10 +906,11 @@ function formatMethodName(method) {
 function getMethodDescription(method) {
     const descriptions = {
         'auto': 'Automatically select the most realistic method available on this system.',
-        'slot': 'Actually cut and restore slot power (closest to physical disconnect).',
-        'hot': 'Reset the PCIe link using upstream bridge controls.',
-        'logical': 'Software-only remove and re-scan (no hardware reset).',
-        'nvidia': 'Use NVIDIA driver reset functionality.'
+        'slot': 'Actually cut and restore slot power (closest to physical disconnect). Linux only.',
+        'hot': 'Reset the PCIe link using upstream bridge controls. Linux only.',
+        'logical': 'Software-only remove and re-scan. Linux only.',
+        'nvidia': 'Use NVIDIA driver reset functionality. Linux only.',
+        'memory_flood': '⚠️ EXPERIMENTAL: Floods GPU memory to trigger OOM/driver reset. May cause system instability! This is the only method available in WSL2/Docker.'
     };
     return descriptions[method] || 'Custom disconnect method.';
 }
diff --git a/static/js/socket-handlers.js b/static/js/socket-handlers.js
index b1a69de..c2cda37 100644
--- a/static/js/socket-handlers.js
+++ b/static/js/socket-handlers.js
@@ -106,6 +106,12 @@ socket.onmessage = function(event) {
             overviewContainer.insertAdjacentHTML('beforeend', createOverviewCard(gpuId, gpuInfo));
             initOverviewMiniChart(gpuId, gpuInfo.utilization);
             lastDOMUpdate[gpuId] = now;
+            
+            // Add disconnect button to the card
+            const card = overviewContainer.querySelector(`[data-gpu-id="${gpuId}"]`);
+            if (card && window.addGPUSelectCheckbox) {
+                window.addGPUSelectCheckbox(gpuId, card);
+            }
         }
     });
     
@@ -331,6 +337,12 @@ function handleClusterData(data) {
                     nodeGrid.insertAdjacentHTML('beforeend', createClusterGPUCard(nodeName, gpuId, gpuInfo));
                     initOverviewMiniChart(fullGpuId, gpuInfo.utilization);
                     lastDOMUpdate[fullGpuId] = now;
+                    
+                    // Add disconnect button to the card
+                    const card = nodeGrid.querySelector(`[data-gpu-id="${fullGpuId}"]`);
+                    if (card && window.addGPUSelectCheckbox) {
+                        window.addGPUSelectCheckbox(fullGpuId, card, { node_name: nodeName });
+                    }
                 }
             });
         } else {
diff --git a/static/js/ui.js b/static/js/ui.js
index 54268fe..1bd450e 100644
--- a/static/js/ui.js
+++ b/static/js/ui.js
@@ -84,6 +84,12 @@ function ensureGPUTab(gpuId, gpuInfo, shouldUpdateDOM = true) {
         // Do not reinitialize chartData here; it would break existing chart references
         if (!chartData[gpuId]) initGPUData(gpuId);
         initGPUCharts(gpuId);
+        
+        // Add disconnect button to the card
+        const card = document.getElementById(`gpu-${gpuId}`);
+        if (card && window.addGPUSelectCheckbox) {
+            window.addGPUSelectCheckbox(gpuId, card);
+        }
     } else if (existingCard) {
         updateGPUDisplay(gpuId, gpuInfo, shouldUpdateDOM);
     }