Files
drone-footage-manager/backend/filesystem_health.py
Alihan dec49a43f9 Add filesystem health monitoring and compression queue system
- Implement periodic filesystem write permission checks (60-minute intervals)
- Add real-time health status monitoring with SSE endpoints
- Display system health banner when storage issues detected
- Limit compression to 1 concurrent job with queue support
- Add max queue limit of 10 pending jobs
- Show queue positions for pending compression jobs
- Update button text dynamically (Start/Queue Compression)
- Enable write access to footage mount in Docker
- Add comprehensive logging for health checks and compression

Co-Authored-By: Alihan <alihan@example.com>
2025-10-12 22:54:21 +03:00

171 lines
6.3 KiB
Python

import asyncio
import errno
import inspect
import logging
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional
# Configure logging
logger = logging.getLogger(__name__)

# Configuration constants
HEALTH_CHECK_INTERVAL_SECONDS = 3600  # Check every 60 minutes
TEST_FILE_PREFIX = ".write_test_"


class FilesystemHealthChecker:
    """
    Monitors filesystem write permissions by periodically attempting to write a test file.
    Tracks health status and provides real-time updates to the application.

    Public state:
        is_healthy: result of the most recent check (starts as True).
        last_check_time: naive local ``datetime`` of the most recent check start,
            or None if no check has run yet.
        error_message: description of the most recent failure, or None.
    """

    def __init__(self, base_path: Path):
        """
        Args:
            base_path: Directory whose writability is monitored. A ``str`` is
                accepted as well; the path is resolved to an absolute path.
        """
        self.base_path = Path(base_path).resolve()
        self.is_healthy = True
        self.last_check_time: Optional[datetime] = None
        self.error_message: Optional[str] = None
        self._monitoring_task: Optional[asyncio.Task] = None
        # Callables invoked with the status dict whenever `is_healthy` flips.
        self._status_change_callbacks: list = []

    def add_status_change_callback(self, callback):
        """
        Register a callback to be notified when health status changes.

        The callback receives the dict returned by `get_status()`. It may be a
        plain function, a coroutine function, or any callable returning an
        awaitable.
        """
        self._status_change_callbacks.append(callback)

    async def _notify_status_change(self):
        """Notify all registered callbacks of status change.

        A failing callback is logged and does not prevent the remaining
        callbacks from running.
        """
        # Iterate over a snapshot so a callback may register further callbacks
        # without disturbing this notification round.
        for callback in list(self._status_change_callbacks):
            try:
                # Inspect the *result* rather than the callable: this also
                # covers partials and objects with an async __call__.
                result = callback(self.get_status())
                if inspect.isawaitable(result):
                    await result
            except asyncio.CancelledError:
                # On Python 3.7 CancelledError derives from Exception; never
                # swallow a cancellation request.
                raise
            except Exception as e:
                logger.error(f"Error in status change callback: {e}")

    @staticmethod
    def _discard_probe(probe_path: Path) -> None:
        """Best-effort removal of a leftover probe file; never raises."""
        try:
            # Path.exists() re-raises some OSErrors (e.g. EACCES) on older
            # Python versions, so it must live inside the try block too.
            if not probe_path.exists():
                return
            probe_path.unlink()
            logger.debug(f"Cleaned up test file: {probe_path}")
        except FileNotFoundError:
            # Vanished between the existence check and the unlink.
            return
        except Exception as e:
            logger.warning(f"Failed to clean up test file {probe_path}: {e}")

    def _probe_write(self) -> Optional[str]:
        """
        Synchronously write, read back and delete a probe file in `base_path`.

        Returns:
            None when the probe succeeded, otherwise a human-readable error
            message describing the failure.
        """
        # Timestamp keeps the name readable; the UUID keeps overlapping probes
        # from stepping on each other's file.
        timestamp = int(time.time() * 1000)
        probe_path = self.base_path / f"{TEST_FILE_PREFIX}{timestamp}_{uuid.uuid4().hex}"
        logger.debug(f"Testing write permission: {probe_path}")
        try:
            # Attempt to write test file
            probe_path.write_text(f"Health check at {datetime.now().isoformat()}\n")
            # Verify file exists and is readable
            if not probe_path.exists():
                raise IOError("Test file was not created successfully")
            if not probe_path.read_text():
                raise IOError("Test file is empty after write")
            # Deleting is part of the probe: failure to delete is a failure.
            probe_path.unlink()
            logger.debug("Write permission test passed")
            return None
        except PermissionError as e:
            logger.error(f"Permission denied writing to {self.base_path}: {e}")
            return f"Permission denied: {str(e)}"
        except OSError as e:
            logger.error(f"OS error writing to {self.base_path}: {e}")
            # Prefer the errno; the message text is platform/locale dependent
            # and is kept only as a fallback.
            if e.errno == errno.EROFS or "Read-only file system" in str(e):
                return "Filesystem is mounted as read-only"
            return f"OS error: {str(e)}"
        except Exception as e:
            logger.error(f"Unexpected error during write test: {e}", exc_info=True)
            return f"Unexpected error: {str(e)}"
        finally:
            # Ensure cleanup even if error occurs
            self._discard_probe(probe_path)

    async def check_write_permission(self) -> bool:
        """
        Attempt to write a test file to verify write permissions.

        The blocking file operations run in the event loop's default executor
        so a slow or unresponsive mount does not stall the event loop.

        Returns True if write successful, False otherwise. On failure,
        `error_message` is set to a description of the problem.
        """
        loop = asyncio.get_running_loop()
        failure = await loop.run_in_executor(None, self._probe_write)
        if failure is None:
            return True
        self.error_message = failure
        return False

    async def perform_health_check(self) -> Dict:
        """
        Perform a single health check and update status.

        Registered callbacks are notified when `is_healthy` changes.

        Returns the current health status (see `get_status`).
        """
        self.last_check_time = datetime.now()
        can_write = await self.check_write_permission()

        # Read the previous state only after the await so that the
        # compare-and-update below has no suspension point in between.
        previous_health = self.is_healthy
        if can_write:
            self.is_healthy = True
            self.error_message = None
            logger.info(f"Filesystem health check PASSED at {self.last_check_time.isoformat()}")
        else:
            self.is_healthy = False
            logger.error(
                f"Filesystem health check FAILED at {self.last_check_time.isoformat()}: "
                f"{self.error_message}"
            )

        # Notify if status changed
        if previous_health != self.is_healthy:
            await self._notify_status_change()
        return self.get_status()

    def get_status(self) -> Dict:
        """Get current health status as a dict with the keys
        ``healthy``, ``last_check`` (ISO string or None), ``error`` and
        ``base_path``."""
        return {
            "healthy": self.is_healthy,
            "last_check": self.last_check_time.isoformat() if self.last_check_time else None,
            "error": self.error_message,
            "base_path": str(self.base_path),
        }

    async def _monitoring_loop(self):
        """Background task that periodically checks filesystem health.

        Runs until cancelled; an unexpected error in one check is logged and
        the loop continues with the next interval.
        """
        interval_minutes = HEALTH_CHECK_INTERVAL_SECONDS / 60
        logger.info(
            f"Starting filesystem health monitoring for {self.base_path} "
            f"(interval: {interval_minutes:.0f} minutes)"
        )
        try:
            while True:
                try:
                    await self.perform_health_check()
                except asyncio.CancelledError:
                    # Must not be swallowed by the generic handler below
                    # (CancelledError is an Exception subclass on Python 3.7).
                    raise
                except Exception as e:
                    logger.error(f"Error in health monitoring loop: {e}", exc_info=True)
                await asyncio.sleep(HEALTH_CHECK_INTERVAL_SECONDS)
        except asyncio.CancelledError:
            # Single exit point for cancellation, whether it arrives during a
            # check or during the sleep; the task then finishes normally.
            logger.info("Filesystem health monitoring stopped")

    def start_monitoring(self):
        """Start the background health monitoring task.

        Must be called while an event loop is running, because it uses
        `asyncio.create_task`. Does nothing but log a warning if a monitoring
        task is already active.
        """
        if self._monitoring_task is None or self._monitoring_task.done():
            self._monitoring_task = asyncio.create_task(self._monitoring_loop())
            logger.info("Filesystem health monitoring started")
        else:
            logger.warning("Monitoring task already running")

    def stop_monitoring(self):
        """Stop the background health monitoring task, if one is running."""
        if self._monitoring_task and not self._monitoring_task.done():
            self._monitoring_task.cancel()
            logger.info("Filesystem health monitoring stopped")