Major features:
- GPU auto-reset on CUDA errors with cooldown protection (handles sleep/wake; see the cooldown sketch after this list)
- Async job queue system for long-running transcriptions
- Comprehensive GPU health monitoring with real model tests (see the probe sketch after the script below)
- Phase 1 component testing with detailed logging

New modules:
- src/core/gpu_reset.py: GPU driver reset with 5-min cooldown
- src/core/gpu_health.py: Real GPU health checks using model inference
- src/core/job_queue.py: FIFO queue with background worker and persistence
- src/utils/test_audio_generator.py: Test audio generation for GPU checks
- test_phase1.py: Component tests with logging
- reset_gpu.sh: GPU driver reset script

Updates:
- CLAUDE.md: Added GPU auto-reset docs and passwordless sudo setup
- requirements.txt: Updated to PyTorch CUDA 12.4
- Model manager: Integrated GPU health check with reset
- Both servers: Added startup GPU validation with auto-reset
- Startup scripts: Added GPU_RESET_COOLDOWN_MINUTES env var
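The cooldown guard is what keeps a flaky GPU from triggering reset loops right after sleep/wake, so it is worth sketching. The following is a minimal sketch, not the actual src/core/gpu_reset.py implementation: the function name reset_gpu_if_allowed and the script path are assumptions, while GPU_RESET_COOLDOWN_MINUTES and the 5-minute default come from the commit message above.

import os
import subprocess
import time

# Cooldown window, configurable via the env var named in the commit message.
_COOLDOWN_SECONDS = int(os.environ.get("GPU_RESET_COOLDOWN_MINUTES", "5")) * 60
_last_reset_at = None  # monotonic timestamp of the last reset attempt


def reset_gpu_if_allowed(script_path="./reset_gpu.sh"):
    """Run the driver-reset script unless a reset was attempted within the cooldown.

    Returns True if the script ran and exited 0, False if skipped or it failed.
    """
    global _last_reset_at
    now = time.monotonic()
    if _last_reset_at is not None and now - _last_reset_at < _COOLDOWN_SECONDS:
        return False  # still cooling down; skip to avoid back-to-back resets
    _last_reset_at = now
    result = subprocess.run(["bash", script_path], capture_output=True, text=True)
    return result.returncode == 0

Recording the timestamp before the script runs means even a failed reset counts toward the cooldown, which is the conservative choice when the failure itself is likely driver-related.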
reset_gpu.sh · 71 lines · 2.5 KiB · Bash · Executable File
#!/bin/bash
# Script to reset NVIDIA GPU drivers without rebooting
# This reloads the kernel modules and restarts the nvidia-persistenced service

echo "============================================================"
echo "NVIDIA GPU Driver Reset Script"
echo "============================================================"
echo ""

# Stop nvidia-persistenced service
echo "Stopping nvidia-persistenced service..."
sudo systemctl stop nvidia-persistenced
if [ $? -eq 0 ]; then
    echo "✓ nvidia-persistenced stopped"
else
    echo "✗ Failed to stop nvidia-persistenced"
    exit 1
fi
echo ""

# Unload NVIDIA kernel modules (dependents first: nvidia_uvm, nvidia_drm and
# nvidia_modeset all depend on nvidia, so nvidia must be removed last)
echo "Unloading NVIDIA kernel modules..."
sudo rmmod nvidia_uvm 2>/dev/null && echo "✓ nvidia_uvm unloaded" || echo " nvidia_uvm not loaded or failed to unload"
sudo rmmod nvidia_drm 2>/dev/null && echo "✓ nvidia_drm unloaded" || echo " nvidia_drm not loaded or failed to unload"
sudo rmmod nvidia_modeset 2>/dev/null && echo "✓ nvidia_modeset unloaded" || echo " nvidia_modeset not loaded or failed to unload"
sudo rmmod nvidia 2>/dev/null && echo "✓ nvidia unloaded" || echo " nvidia not loaded or failed to unload"
echo ""

# Small delay to ensure a clean unload
sleep 1

# Reload NVIDIA kernel modules (base module first, then its dependents)
echo "Loading NVIDIA kernel modules..."
sudo modprobe nvidia && echo "✓ nvidia loaded" || { echo "✗ Failed to load nvidia"; exit 1; }
sudo modprobe nvidia_modeset && echo "✓ nvidia_modeset loaded" || { echo "✗ Failed to load nvidia_modeset"; exit 1; }
sudo modprobe nvidia_uvm && echo "✓ nvidia_uvm loaded" || { echo "✗ Failed to load nvidia_uvm"; exit 1; }
sudo modprobe nvidia_drm && echo "✓ nvidia_drm loaded" || { echo "✗ Failed to load nvidia_drm"; exit 1; }
echo ""

# Restart nvidia-persistenced service
echo "Starting nvidia-persistenced service..."
sudo systemctl start nvidia-persistenced
if [ $? -eq 0 ]; then
    echo "✓ nvidia-persistenced started"
else
    echo "✗ Failed to start nvidia-persistenced"
    exit 1
fi
echo ""

# Verify GPU is accessible
echo "Verifying GPU accessibility..."
if command -v nvidia-smi &> /dev/null; then
    nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
    if [ $? -eq 0 ]; then
        echo "✓ GPU reset successful"
    else
        echo "✗ GPU not accessible"
        exit 1
    fi
else
    echo "✗ nvidia-smi not found"
    exit 1
fi
echo ""

echo "============================================================"
echo "GPU driver reset completed successfully"
echo "============================================================"
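The script's final nvidia-smi query only confirms that the reloaded driver can enumerate the card; the commit message also describes health checks that exercise real inference. Below is a minimal sketch of a stricter probe plus the startup validate-then-reset flow. Every name here is an illustrative assumption rather than the actual src/core/gpu_health.py API; only the probe-reset-reprobe idea is taken from the commit message.

import subprocess
import sys

# A tiny CUDA workload run in a child process. Keeping it out-of-process means the
# caller never holds a CUDA context, which would otherwise block the rmmod calls
# in reset_gpu.sh.
_PROBE = (
    "import torch; "
    "x = torch.randn(64, 64, device='cuda'); "
    "(x @ x); "
    "torch.cuda.synchronize()"
)


def check_gpu_health(timeout_s=60.0):
    """Return True if the child-process CUDA probe completes successfully in time."""
    try:
        proc = subprocess.run([sys.executable, "-c", _PROBE], timeout=timeout_s)
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False


def ensure_gpu_or_reset(reset_script="./reset_gpu.sh"):
    """Startup validation: probe the GPU, attempt one reset if the probe fails, probe again."""
    if check_gpu_health():
        return True
    subprocess.run(["bash", reset_script], check=False)
    return check_gpu_health()

The timeout matters because a wedged GPU often hangs kernel launches instead of raising an error, and a hang would otherwise stall server startup indefinitely.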