drone-footage-manager/backend/main.py
Alihan 92338df716 Fix compression job queue persistence and recovery after Docker restart
This commit ensures that compression jobs survive Docker container restarts
and are automatically recovered and restarted.

Changes:
- Modified CancelledError handler to preserve job status during shutdown
- Jobs now keep 'processing' or 'validating' status instead of being marked
  as 'cancelled' when the app shuts down
- Added job persistence layer using SQLite database (an illustrative schema sketch follows below)
- Implemented automatic job recovery on application startup
- Added process cleanup utilities for orphaned ffmpeg processes
- User-initiated cancellations still properly mark jobs as cancelled
- Jobs remain visible in the frontend after recovery

The recovery system:
1. Detects interrupted jobs (processing/validating status)
2. Cleans up orphaned ffmpeg processes and temp files
3. Restarts interrupted jobs from the beginning
4. Maintains queue order and respects concurrency limits
5. Works with multiple jobs in the queue

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-19 02:57:04 +03:00
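
The SQLite persistence layer referenced above is implemented in a separate module that is not shown on this page. As a rough sketch only, assuming a single job table keyed by job_id, such a layer might look like the snippet below; the table name, column names, and helper function are illustrative, not the project's actual schema (they mirror the fields the recovery code reads: job_id, file_path, status, progress, ffmpeg_pid, error).

# Illustrative sketch only - the real schema lives in the (unshown) persistence module.
import sqlite3

JOBS_SCHEMA = """
CREATE TABLE IF NOT EXISTS compression_jobs (
    job_id            TEXT PRIMARY KEY,
    file_path         TEXT NOT NULL,
    reduce_percentage INTEGER NOT NULL,
    status            TEXT NOT NULL,    -- pending / processing / validating / completed / failed / cancelled
    progress          REAL DEFAULT 0.0,
    ffmpeg_pid        INTEGER,          -- lets recovery clean up orphaned ffmpeg processes
    error             TEXT,
    created_at        TEXT
);
"""

def init_jobs_db(db_path: str = "compression_jobs.db") -> sqlite3.Connection:
    """Create the job table if missing and return a connection (hypothetical helper)."""
    conn = sqlite3.connect(db_path)
    conn.executescript(JOBS_SCHEMA)
    return conn

Persisting status and ffmpeg_pid on every transition is what lets recover_compression_jobs() below distinguish interrupted jobs (processing/validating) from pending ones and clean up any ffmpeg processes they left behind.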

656 lines
23 KiB
Python

from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, StreamingResponse, Response
from pathlib import Path
from typing import List, Dict, Optional, Any
from pydantic import BaseModel
import os
from datetime import datetime
import aiofiles
import aiofiles.os
import asyncio
import json
import time
import logging
import mimetypes
from sse_starlette.sse import EventSourceResponse
from compression import CompressionManager
from filesystem_health import FilesystemHealthChecker
from process_utils import (
    kill_process_safely,
    cleanup_temp_files,
    find_orphaned_ffmpeg_processes
)
# Configure logging
logger = logging.getLogger(__name__)
app = FastAPI(title="Drone Footage Manager API")
# Configuration constants
STREAM_CHUNK_SIZE = 1024 * 1024 # 1MB chunks for video streaming
SSE_UPDATE_INTERVAL = 0.5 # Update every 500ms
CACHE_TTL_SECONDS = 60 # Cache directory listings for 60 seconds
# Job recovery configuration
AUTO_RESTART_INTERRUPTED_JOBS = os.getenv('AUTO_RESTART_INTERRUPTED_JOBS', 'true').lower() == 'true'
CLEANUP_ORPHANED_PROCESSES = os.getenv('CLEANUP_ORPHANED_PROCESSES', 'true').lower() == 'true'
# Base path for footages
FOOTAGES_PATH = Path("/footages")
# Simple in-memory cache for directory listings
class SimpleCache:
    def __init__(self, ttl_seconds: int = CACHE_TTL_SECONDS):
        self.cache: Dict[str, tuple[float, Any]] = {}
        self.ttl = ttl_seconds

    def get(self, key: str) -> Optional[Any]:
        if key in self.cache:
            timestamp, value = self.cache[key]
            if time.time() - timestamp < self.ttl:
                return value
            else:
                del self.cache[key]
        return None

    def set(self, key: str, value: Any):
        self.cache[key] = (time.time(), value)

    def clear(self):
        self.cache.clear()

    def invalidate(self, pattern: str = None):
        """Invalidate cache entries matching pattern or all if pattern is None"""
        if pattern is None:
            self.cache.clear()
        else:
            keys_to_delete = [k for k in self.cache.keys() if pattern in k]
            for key in keys_to_delete:
                del self.cache[key]
directory_cache = SimpleCache()
# Initialize filesystem health checker first (compression manager needs it)
filesystem_health_checker = FilesystemHealthChecker(FOOTAGES_PATH)
# Initialize compression manager with health checker and cache callback
compression_manager = CompressionManager(
    max_concurrent=1,
    allowed_base_path=FOOTAGES_PATH,
    health_checker=filesystem_health_checker,
    cache_invalidation_callback=directory_cache.invalidate
)
# CORS middleware for frontend communication
ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*").split(",")
app.add_middleware(
    CORSMiddleware,
    allow_origins=ALLOWED_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Supported video and image extensions
VIDEO_EXTENSIONS = {".mp4", ".MP4", ".mov", ".MOV", ".avi", ".AVI"}
IMAGE_EXTENSIONS = {".jpg", ".JPG", ".jpeg", ".JPEG", ".png", ".PNG"}
def is_media_file(filename: str) -> bool:
    """Check if file is a video or image"""
    ext = Path(filename).suffix
    return ext in VIDEO_EXTENSIONS or ext in IMAGE_EXTENSIONS
async def get_file_info(file_path: Path) -> Dict:
    """Get file metadata"""
    stat = await aiofiles.os.stat(file_path)
    return {
        "name": file_path.name,
        "size": stat.st_size,
        "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
        "is_video": file_path.suffix in VIDEO_EXTENSIONS,
        "is_image": file_path.suffix in IMAGE_EXTENSIONS,
    }
async def recover_compression_jobs():
    """
    Recover compression jobs from database after app restart/crash

    This function:
    1. Loads all jobs from database
    2. Handles interrupted jobs (processing/validating)
    3. Cleans up orphaned ffmpeg processes
    4. Restarts pending jobs automatically
    """
    logger.info("=" * 60)
    logger.info("Starting compression job recovery...")
    logger.info("=" * 60)

    # Initialize persistence layer
    await compression_manager.initialize_persistence()

    # Load all jobs from database
    await compression_manager.load_jobs_from_database()

    # Get interrupted jobs (were processing when app crashed)
    interrupted_jobs = await compression_manager.persistence.get_interrupted_jobs()
    if interrupted_jobs:
        logger.info(f"Found {len(interrupted_jobs)} interrupted jobs")

        for job_data in interrupted_jobs:
            job_id = job_data['job_id']
            file_path = job_data['file_path']
            ffmpeg_pid = job_data.get('ffmpeg_pid')

            logger.info(f"Processing interrupted job {job_id}: {Path(file_path).name}")

            # Kill orphaned ffmpeg process if it exists and is verified as ours
            if ffmpeg_pid and CLEANUP_ORPHANED_PROCESSES:
                killed = kill_process_safely(ffmpeg_pid, file_path, timeout=10)
                if killed:
                    logger.info(f" ✓ Killed orphaned ffmpeg process PID {ffmpeg_pid}")
                else:
                    logger.warning(f" ⚠ Could not verify/kill PID {ffmpeg_pid}")

            # Clean up temp files
            cleaned = cleanup_temp_files(file_path)
            if cleaned > 0:
                logger.info(f" ✓ Cleaned up {cleaned} temp file(s)")

            # Decide what to do with interrupted job
            if AUTO_RESTART_INTERRUPTED_JOBS:
                # Restart the job
                logger.info(" ⟳ Restarting interrupted job...")
                await compression_manager.persistence.update_job_status(
                    job_id, "pending", progress=0.0
                )

                # Update in-memory job status
                if job_id in compression_manager.jobs:
                    compression_manager.jobs[job_id].status = "pending"
                    compression_manager.jobs[job_id].progress = 0.0
                    compression_manager.jobs[job_id].ffmpeg_pid = None
            else:
                # Mark as failed
                logger.info(" ✗ Marking as failed (auto-restart disabled)")
                await compression_manager.persistence.update_job_status(
                    job_id, "failed",
                    error="Job interrupted by application restart/crash"
                )

                # Update in-memory job status
                if job_id in compression_manager.jobs:
                    compression_manager.jobs[job_id].status = "failed"
                    compression_manager.jobs[job_id].error = "Job interrupted by application restart/crash"
                    compression_manager.jobs[job_id].ffmpeg_pid = None
    else:
        logger.info("No interrupted jobs found")
    # Get pending jobs
    pending_jobs = await compression_manager.persistence.get_jobs_by_status("pending")
    if pending_jobs:
        logger.info(f"Found {len(pending_jobs)} pending jobs - will be processed automatically")

        # Restart pending jobs
        for job_data in pending_jobs:
            job_id = job_data['job_id']
            if job_id in compression_manager.jobs:
                job = compression_manager.jobs[job_id]
                # Only restart if not already in active jobs
                if job_id not in compression_manager.active_jobs:
                    logger.info(f" ⟳ Restarting pending job: {Path(job.file_path).name}")
                    task = asyncio.create_task(compression_manager.compress_video(job))
                    compression_manager.active_jobs[job_id] = task
                    # Bind job_id at definition time; a bare closure would capture the loop
                    # variable and always pop the last job's id when any task finishes
                    task.add_done_callback(
                        lambda t, jid=job_id: compression_manager.active_jobs.pop(jid, None)
                    )
    else:
        logger.info("No pending jobs to restart")
    # Optional: Find any other orphaned ffmpeg processes in our path
    if CLEANUP_ORPHANED_PROCESSES:
        orphaned = find_orphaned_ffmpeg_processes(FOOTAGES_PATH)
        if orphaned:
            logger.warning(f"Found {len(orphaned)} untracked ffmpeg processes in our path:")
            for proc in orphaned:
                logger.warning(f" PID {proc['pid']}: {proc['cmdline'][:100]}...")
            logger.warning(" ⚠ NOT killing (not in our database) - manual review recommended")

    # Clean up old completed/cancelled jobs
    deleted = await compression_manager.persistence.cleanup_old_jobs()

    logger.info("=" * 60)
    logger.info("Compression job recovery complete")
    logger.info("=" * 60)
@app.on_event("startup")
async def startup_event():
"""Run startup tasks"""
logger.info("Running startup tasks...")
# Perform initial filesystem health check
initial_status = await filesystem_health_checker.perform_health_check()
if initial_status["healthy"]:
logger.info("✓ Initial filesystem health check PASSED")
else:
logger.error(
f"✗ Initial filesystem health check FAILED: {initial_status['error']}"
)
# Start background monitoring
filesystem_health_checker.start_monitoring()
# Recover compression jobs from database
await recover_compression_jobs()
logger.info("Application startup complete")
@app.on_event("shutdown")
async def shutdown_event():
"""Run shutdown tasks"""
logger.info("Shutting down...")
filesystem_health_checker.stop_monitoring()
@app.get("/")
async def root():
return {"message": "Drone Footage Manager API", "status": "running"}
@app.get("/api/locations")
async def get_locations() -> List[Dict]:
"""Get list of all location folders with metadata"""
# Check cache first
cached = directory_cache.get("locations")
if cached is not None:
return cached
if not FOOTAGES_PATH.exists():
raise HTTPException(status_code=500, detail="Footages directory not found")
locations = []
for item in FOOTAGES_PATH.iterdir():
if item.is_dir():
stat = await aiofiles.os.stat(item)
locations.append({
"name": item.name,
"modified": datetime.fromtimestamp(stat.st_mtime).isoformat()
})
# Cache the result
directory_cache.set("locations", locations)
return locations
@app.get("/api/locations/{location}/dates")
async def get_dates(location: str) -> List[Dict]:
"""Get list of date folders for a location with metadata"""
# Check cache first
cache_key = f"dates:{location}"
cached = directory_cache.get(cache_key)
if cached is not None:
return cached
# Sanitize path components to prevent traversal
if ".." in location or "/" in location:
raise HTTPException(status_code=400, detail="Invalid path characters")
location_path = (FOOTAGES_PATH / location).resolve()
# Ensure resolved path is still within FOOTAGES_PATH
try:
location_path.relative_to(FOOTAGES_PATH.resolve())
except ValueError:
raise HTTPException(status_code=403, detail="Access denied")
if not location_path.exists() or not location_path.is_dir():
raise HTTPException(status_code=404, detail="Location not found")
dates = []
has_files_in_root = False
for item in location_path.iterdir():
if item.is_dir():
stat = await aiofiles.os.stat(item)
dates.append({
"name": item.name,
"modified": datetime.fromtimestamp(stat.st_mtime).isoformat()
})
elif item.is_file():
has_files_in_root = True
# If no date folders but has files in root, return special marker
if not dates and has_files_in_root:
dates.append({
"name": "__root__",
"modified": None,
"message": "📁 Files not organized by date"
})
# Cache the result
directory_cache.set(cache_key, dates)
return dates
@app.get("/api/files/{location}/{date}")
async def get_files(location: str, date: str) -> List[Dict]:
"""Get list of files for a location and date"""
# Check cache first
cache_key = f"files:{location}:{date}"
cached = directory_cache.get(cache_key)
if cached is not None:
return cached
# Sanitize path components to prevent traversal
if ".." in location or ".." in date or "/" in location or "/" in date:
raise HTTPException(status_code=400, detail="Invalid path characters")
# Handle special __root__ marker for locations with files in root
if date == "__root__":
files_path = (FOOTAGES_PATH / location).resolve()
else:
files_path = (FOOTAGES_PATH / location / date).resolve()
# Ensure resolved path is still within FOOTAGES_PATH
try:
files_path.relative_to(FOOTAGES_PATH.resolve())
except ValueError:
raise HTTPException(status_code=403, detail="Access denied")
if not files_path.exists() or not files_path.is_dir():
raise HTTPException(status_code=404, detail="Path not found")
files = []
for item in sorted(files_path.iterdir()):
if item.is_file() and is_media_file(item.name):
files.append(await get_file_info(item))
# Cache the result
directory_cache.set(cache_key, files)
return files
@app.get("/api/stream/{location}/{date}/{filename}")
async def stream_video(location: str, date: str, filename: str, request: Request):
"""Stream video file with HTTP range request support for fast seeking"""
# Sanitize path components to prevent traversal
if ".." in location or ".." in date or ".." in filename or "/" in location or "/" in date or "/" in filename:
raise HTTPException(status_code=400, detail="Invalid path characters")
# Handle __root__ case (files not in date subdirectories)
if date == "__root__":
file_path = (FOOTAGES_PATH / location / filename).resolve()
else:
file_path = (FOOTAGES_PATH / location / date / filename).resolve()
# Ensure resolved path is still within FOOTAGES_PATH
try:
file_path.relative_to(FOOTAGES_PATH.resolve())
except ValueError:
raise HTTPException(status_code=403, detail="Access denied")
if not file_path.exists() or not file_path.is_file():
raise HTTPException(status_code=404, detail="File not found")
# Check if it's a video file
if file_path.suffix not in VIDEO_EXTENSIONS:
raise HTTPException(status_code=400, detail="Not a video file")
# Get file size
file_stat = await aiofiles.os.stat(file_path)
file_size = file_stat.st_size
# Parse range header
range_header = request.headers.get("range")
if range_header:
# Parse range header (e.g., "bytes=0-1023")
range_match = range_header.replace("bytes=", "").split("-")
start = int(range_match[0]) if range_match[0] else 0
end = int(range_match[1]) if range_match[1] else file_size - 1
end = min(end, file_size - 1)
# Calculate content length
content_length = end - start + 1
# Create streaming response
async def iterfile():
async with aiofiles.open(file_path, mode='rb') as f:
await f.seek(start)
remaining = content_length
while remaining > 0:
chunk = await f.read(min(STREAM_CHUNK_SIZE, remaining))
if not chunk:
break
remaining -= len(chunk)
yield chunk
headers = {
"Content-Range": f"bytes {start}-{end}/{file_size}",
"Accept-Ranges": "bytes",
"Content-Length": str(content_length),
"Content-Type": mimetypes.guess_type(file_path)[0] or "video/mp4",
}
return StreamingResponse(
iterfile(),
status_code=206,
headers=headers,
media_type=mimetypes.guess_type(file_path)[0] or "video/mp4"
)
# No range header - return full file
return FileResponse(
file_path,
media_type=mimetypes.guess_type(file_path)[0] or "video/mp4",
headers={"Accept-Ranges": "bytes"}
)
@app.get("/api/image/{location}/{date}/{filename}")
async def get_image(location: str, date: str, filename: str):
"""Serve image file"""
# Sanitize path components to prevent traversal
if ".." in location or ".." in date or ".." in filename or "/" in location or "/" in date or "/" in filename:
raise HTTPException(status_code=400, detail="Invalid path characters")
# Handle __root__ case (files not in date subdirectories)
if date == "__root__":
file_path = (FOOTAGES_PATH / location / filename).resolve()
else:
file_path = (FOOTAGES_PATH / location / date / filename).resolve()
# Ensure resolved path is still within FOOTAGES_PATH
try:
file_path.relative_to(FOOTAGES_PATH.resolve())
except ValueError:
raise HTTPException(status_code=403, detail="Access denied")
if not file_path.exists() or not file_path.is_file():
raise HTTPException(status_code=404, detail="File not found")
# Check if it's an image file
if file_path.suffix not in IMAGE_EXTENSIONS:
raise HTTPException(status_code=400, detail="Not an image file")
# Determine media type dynamically
media_type = mimetypes.guess_type(file_path)[0] or "image/jpeg"
return FileResponse(file_path, media_type=media_type)
# ========== COMPRESSION API ENDPOINTS ==========
class CompressionRequest(BaseModel):
    location: str
    date: str
    filename: str
    reduce_percentage: int
@app.post("/api/compress/start")
async def start_compression(request: CompressionRequest):
"""Start a compression job"""
if not 1 <= request.reduce_percentage <= 90:
raise HTTPException(status_code=400, detail="Percentage must be between 1-90")
# Handle __root__ case (files not in date subdirectories)
if request.date == "__root__":
file_path = FOOTAGES_PATH / request.location / request.filename
else:
file_path = FOOTAGES_PATH / request.location / request.date / request.filename
if not file_path.exists():
raise HTTPException(status_code=404, detail="File not found")
if file_path.suffix not in VIDEO_EXTENSIONS:
raise HTTPException(status_code=400, detail="File is not a video")
job_id = await compression_manager.start_compression(str(file_path), request.reduce_percentage)
return {"job_id": job_id, "status": "started"}
@app.get("/api/compress/jobs")
async def get_all_jobs():
"""Get all compression jobs"""
jobs = []
# Use snapshot to avoid race condition during iteration
for job in await compression_manager.get_jobs_snapshot():
jobs.append({
"job_id": job.job_id,
"file_path": job.file_path,
"file_name": Path(job.file_path).name,
"reduce_percentage": job.reduce_percentage,
"status": job.status,
"progress": round(job.progress, 1),
"eta_seconds": job.eta_seconds,
"current_pass": job.current_pass,
"current_size_mb": round(job.current_size_mb, 2) if job.current_size_mb else None,
"target_size_mb": round(job.target_size_mb, 2) if job.target_size_mb else None,
"video_bitrate": job.video_bitrate,
"created_at": job.created_at.isoformat() if job.created_at else None,
"output_file": Path(job.output_file).name if job.output_file else None,
"error": job.error
})
return jobs
@app.get("/api/compress/jobs/{job_id}")
async def get_job_status(job_id: str):
"""Get status of specific compression job"""
if job_id not in compression_manager.jobs:
raise HTTPException(status_code=404, detail="Job not found")
job = compression_manager.jobs[job_id]
return {
"job_id": job.job_id,
"status": job.status,
"progress": round(job.progress, 1),
"eta_seconds": job.eta_seconds,
"current_pass": job.current_pass,
"output_file": Path(job.output_file).name if job.output_file else None,
"error": job.error
}
@app.delete("/api/compress/jobs/{job_id}")
async def delete_job(job_id: str, action: str = "cancel"):
"""Delete or cancel a compression job
Args:
job_id: The job ID to delete
action: 'cancel' to cancel a running job, 'remove' to remove from list
"""
if job_id not in compression_manager.jobs:
raise HTTPException(status_code=404, detail="Job not found")
if action == "remove":
# Remove completed/failed job from list
success = await compression_manager.remove_job(job_id)
if success:
return {"status": "removed"}
else:
raise HTTPException(status_code=400, detail="Cannot remove active job")
else:
# Cancel running job
await compression_manager.cancel_job(job_id)
return {"status": "cancelled"}
@app.get("/api/compress/events")
async def compression_events(request: Request):
"""Server-Sent Events endpoint for real-time progress updates"""
async def event_generator():
try:
while True:
# Check if client is still connected
if await request.is_disconnected():
break
# Send status of all active jobs (use snapshot to avoid race condition)
active_jobs = []
for job in await compression_manager.get_jobs_snapshot():
if job.status in ["pending", "processing", "validating"]:
active_jobs.append({
"job_id": job.job_id,
"status": job.status,
"progress": round(job.progress, 1),
"eta_seconds": job.eta_seconds,
"current_pass": job.current_pass
})
if active_jobs:
yield {
"event": "progress",
"data": json.dumps(active_jobs)
}
await asyncio.sleep(SSE_UPDATE_INTERVAL)
except asyncio.CancelledError:
pass
return EventSourceResponse(event_generator())
# ========== SYSTEM HEALTH API ENDPOINTS ==========
@app.get("/api/system/health")
async def get_system_health():
"""Get current system health status"""
return filesystem_health_checker.get_status()
@app.get("/api/system/health/stream")
async def system_health_stream(request: Request):
"""Server-Sent Events endpoint for real-time health status updates"""
async def event_generator():
try:
while True:
# Check if client is still connected
if await request.is_disconnected():
break
# Send current health status
status = filesystem_health_checker.get_status()
yield {
"event": "health",
"data": json.dumps(status)
}
# Check every 5 seconds
await asyncio.sleep(5)
except asyncio.CancelledError:
pass
return EventSourceResponse(event_generator())
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)