Upgrade to PyTorch 2.6.0 and enhance GPU reset script with Ollama management
- Upgrade PyTorch and torchaudio to 2.6.0 with CUDA 12.4 support - Update GPU reset script to gracefully stop/start Ollama via supervisorctl - Add Docker Compose configuration for both API and MCP server modes - Implement comprehensive Docker entrypoint for multi-mode deployment - Add GPU health check cleanup to prevent memory leaks - Fix transcription memory management with proper resource cleanup - Add filename security validation to prevent path traversal attacks - Include .dockerignore for optimized Docker builds - Remove deprecated supervisor configuration 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
60
.dockerignore
Normal file
60
.dockerignore
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
ENV/
|
||||||
|
.venv
|
||||||
|
|
||||||
|
# Project specific
|
||||||
|
logs/
|
||||||
|
outputs/
|
||||||
|
models/
|
||||||
|
*.log
|
||||||
|
*.logs
|
||||||
|
mcp.logs
|
||||||
|
api.logs
|
||||||
|
|
||||||
|
# Git
|
||||||
|
.git/
|
||||||
|
.gitignore
|
||||||
|
.github/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# Docker
|
||||||
|
.dockerignore
|
||||||
|
docker-compose.yml
|
||||||
|
docker-compose.*.yml
|
||||||
|
|
||||||
|
# Temporary files
|
||||||
|
*.tmp
|
||||||
|
*.temp
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# Documentation (optional - uncomment if you want to exclude)
|
||||||
|
# README.md
|
||||||
|
# CLAUDE.md
|
||||||
|
# IMPLEMENTATION_PLAN.md
|
||||||
|
|
||||||
|
# Scripts (already in container)
|
||||||
|
# reset_gpu.sh - NEEDED for GPU health checks
|
||||||
|
run_api_server.sh
|
||||||
|
run_mcp_server.sh
|
||||||
|
|
||||||
|
# Supervisor config (not needed in container)
|
||||||
|
supervisor/
|
||||||
93
Dockerfile
93
Dockerfile
@@ -1,25 +1,34 @@
|
|||||||
# Use NVIDIA CUDA base image with Python
|
# Multi-purpose Whisper Transcriptor Docker Image
|
||||||
FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
|
# Supports both MCP Server and REST API Server modes
|
||||||
|
# Use SERVER_MODE environment variable to select: "mcp" or "api"
|
||||||
|
|
||||||
# Install Python 3.12
|
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
||||||
|
|
||||||
|
# Prevent interactive prompts during installation
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
# Install system dependencies
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
software-properties-common \
|
software-properties-common \
|
||||||
|
curl \
|
||||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
&& apt-get update && apt-get install -y \
|
&& apt-get update && apt-get install -y \
|
||||||
python3.12 \
|
python3.12 \
|
||||||
python3.12-venv \
|
python3.12-venv \
|
||||||
python3.12-dev \
|
python3.12-dev \
|
||||||
python3-pip \
|
|
||||||
ffmpeg \
|
ffmpeg \
|
||||||
git \
|
git \
|
||||||
|
nginx \
|
||||||
|
supervisor \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Make python3.12 the default
|
# Make python3.12 the default
|
||||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
|
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 && \
|
||||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
|
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
|
||||||
|
|
||||||
# Upgrade pip
|
# Install pip using ensurepip (Python 3.12+ doesn't have distutils)
|
||||||
RUN python -m pip install --upgrade pip
|
RUN python -m ensurepip --upgrade && \
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
|
||||||
# Set working directory
|
# Set working directory
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
@@ -27,30 +36,68 @@ WORKDIR /app
|
|||||||
# Copy requirements first for better caching
|
# Copy requirements first for better caching
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
|
|
||||||
# Install Python dependencies with CUDA support
|
# Install Python dependencies with CUDA 12.4 support
|
||||||
RUN pip install --no-cache-dir \
|
RUN pip install --no-cache-dir \
|
||||||
|
torch==2.6.0 --index-url https://download.pytorch.org/whl/cu124 && \
|
||||||
|
pip install --no-cache-dir \
|
||||||
|
torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124 && \
|
||||||
|
pip install --no-cache-dir \
|
||||||
faster-whisper \
|
faster-whisper \
|
||||||
torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 \
|
fastapi>=0.115.0 \
|
||||||
torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121 \
|
uvicorn[standard]>=0.32.0 \
|
||||||
mcp[cli]
|
python-multipart>=0.0.9 \
|
||||||
|
aiofiles>=23.0.0 \
|
||||||
|
mcp[cli]>=1.2.0 \
|
||||||
|
gTTS>=2.3.0 \
|
||||||
|
pyttsx3>=2.90 \
|
||||||
|
scipy>=1.10.0 \
|
||||||
|
numpy>=1.24.0
|
||||||
|
|
||||||
# Copy application code
|
# Copy application code
|
||||||
COPY src/ ./src/
|
COPY src/ ./src/
|
||||||
COPY pyproject.toml .
|
COPY pyproject.toml .
|
||||||
COPY README.md .
|
|
||||||
|
|
||||||
# Create directories for models and outputs
|
# Copy test audio file for GPU health checks
|
||||||
RUN mkdir -p /models /outputs
|
COPY test.mp3 .
|
||||||
|
|
||||||
|
# Copy nginx configuration
|
||||||
|
COPY nginx/transcriptor.conf /etc/nginx/sites-available/transcriptor.conf
|
||||||
|
|
||||||
|
# Copy entrypoint script and GPU reset script
|
||||||
|
COPY docker-entrypoint.sh /docker-entrypoint.sh
|
||||||
|
COPY reset_gpu.sh /app/reset_gpu.sh
|
||||||
|
RUN chmod +x /docker-entrypoint.sh /app/reset_gpu.sh
|
||||||
|
|
||||||
|
# Create directories for models, outputs, and logs
|
||||||
|
RUN mkdir -p /models /outputs /logs /app/outputs/uploads /app/outputs/batch /app/outputs/jobs
|
||||||
|
|
||||||
# Set Python path
|
# Set Python path
|
||||||
ENV PYTHONPATH=/app/src
|
ENV PYTHONPATH=/app/src
|
||||||
|
|
||||||
# Set environment variables for GPU
|
# Default environment variables (can be overridden)
|
||||||
ENV WHISPER_MODEL_DIR=/models
|
ENV WHISPER_MODEL_DIR=/models \
|
||||||
ENV TRANSCRIPTION_OUTPUT_DIR=/outputs
|
TRANSCRIPTION_OUTPUT_DIR=/outputs \
|
||||||
ENV TRANSCRIPTION_MODEL=large-v3
|
TRANSCRIPTION_BATCH_OUTPUT_DIR=/outputs/batch \
|
||||||
ENV TRANSCRIPTION_DEVICE=cuda
|
TRANSCRIPTION_MODEL=large-v3 \
|
||||||
ENV TRANSCRIPTION_COMPUTE_TYPE=float16
|
TRANSCRIPTION_DEVICE=auto \
|
||||||
|
TRANSCRIPTION_COMPUTE_TYPE=auto \
|
||||||
|
TRANSCRIPTION_OUTPUT_FORMAT=txt \
|
||||||
|
TRANSCRIPTION_BEAM_SIZE=5 \
|
||||||
|
TRANSCRIPTION_TEMPERATURE=0.0 \
|
||||||
|
API_HOST=127.0.0.1 \
|
||||||
|
API_PORT=33767 \
|
||||||
|
JOB_QUEUE_MAX_SIZE=5 \
|
||||||
|
JOB_METADATA_DIR=/outputs/jobs \
|
||||||
|
JOB_RETENTION_DAYS=7 \
|
||||||
|
GPU_HEALTH_CHECK_ENABLED=true \
|
||||||
|
GPU_HEALTH_CHECK_INTERVAL_MINUTES=10 \
|
||||||
|
GPU_HEALTH_TEST_MODEL=tiny \
|
||||||
|
GPU_HEALTH_TEST_AUDIO=/test-audio/test.mp3 \
|
||||||
|
GPU_RESET_COOLDOWN_MINUTES=5 \
|
||||||
|
SERVER_MODE=api
|
||||||
|
|
||||||
# Run the server
|
# Expose port 80 for nginx (API mode only)
|
||||||
CMD ["python", "src/servers/whisper_server.py"]
|
EXPOSE 80
|
||||||
|
|
||||||
|
# Use entrypoint script to handle different server modes
|
||||||
|
ENTRYPOINT ["/docker-entrypoint.sh"]
|
||||||
|
|||||||
403
TRANSCRIPTOR_API_FIX.md
Normal file
403
TRANSCRIPTOR_API_FIX.md
Normal file
@@ -0,0 +1,403 @@
|
|||||||
|
# Transcriptor API - Filename Validation Bug Fix
|
||||||
|
|
||||||
|
## Issue Summary
|
||||||
|
|
||||||
|
The transcriptor API is rejecting valid audio files due to overly strict path validation. Files with `..` (double periods) anywhere in the filename are being rejected as potential path traversal attacks, even when they appear naturally in legitimate filenames.
|
||||||
|
|
||||||
|
## Current Behavior
|
||||||
|
|
||||||
|
### Error Observed
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"detail": {
|
||||||
|
"error": "Upload failed",
|
||||||
|
"message": "Audio file validation failed: Path traversal (..) is not allowed"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### HTTP Response
|
||||||
|
- **Status Code**: 500
|
||||||
|
- **Endpoint**: `POST /transcribe`
|
||||||
|
- **Request**: File upload with filename containing `..`
|
||||||
|
|
||||||
|
### Example Failing Filename
|
||||||
|
```
|
||||||
|
This Weird FPV Drone only takes one kind of Battery... Rekon 35 V2.m4a
|
||||||
|
^^^
|
||||||
|
(Three dots, parsed as "..")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Root Cause Analysis
|
||||||
|
|
||||||
|
### Current Validation Logic (Problematic)
|
||||||
|
The API is likely checking for `..` anywhere in the filename string, which creates false positives:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# CURRENT (WRONG)
|
||||||
|
if ".." in filename:
|
||||||
|
raise ValidationError("Path traversal (..) is not allowed")
|
||||||
|
```
|
||||||
|
|
||||||
|
This rejects legitimate filenames like:
|
||||||
|
- `"video...mp4"` (ellipsis in title)
|
||||||
|
- `"Part 1... Part 2.m4a"` (ellipsis separator)
|
||||||
|
- `"Wait... what.mp4"` (dramatic pause)
|
||||||
|
|
||||||
|
### Actual Security Concern
|
||||||
|
Path traversal attacks use `..` as **directory separators** to navigate up the filesystem:
|
||||||
|
- `../../etc/passwd` (DANGEROUS)
|
||||||
|
- `../../../secrets.txt` (DANGEROUS)
|
||||||
|
- `video...mp4` (SAFE - just a filename)
|
||||||
|
|
||||||
|
## Recommended Fix
|
||||||
|
|
||||||
|
### Option 1: Path Component Validation (Recommended)
|
||||||
|
|
||||||
|
Check for `..` only when it appears as a **complete path component**, not as part of the filename text.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def validate_filename(filename: str) -> bool:
|
||||||
|
"""
|
||||||
|
Validate filename for path traversal attacks.
|
||||||
|
|
||||||
|
Returns True if safe, raises ValidationError if dangerous.
|
||||||
|
"""
|
||||||
|
# Normalize the path
|
||||||
|
normalized = os.path.normpath(filename)
|
||||||
|
|
||||||
|
# Check if normalization changed the path (indicates traversal)
|
||||||
|
if normalized != filename:
|
||||||
|
raise ValidationError(f"Path traversal detected: {filename}")
|
||||||
|
|
||||||
|
# Check for absolute paths
|
||||||
|
if os.path.isabs(filename):
|
||||||
|
raise ValidationError(f"Absolute paths not allowed: {filename}")
|
||||||
|
|
||||||
|
# Split into components and check for parent directory references
|
||||||
|
parts = Path(filename).parts
|
||||||
|
if ".." in parts:
|
||||||
|
raise ValidationError(f"Parent directory references not allowed: {filename}")
|
||||||
|
|
||||||
|
# Check for any path separators (should be basename only)
|
||||||
|
if os.sep in filename or (os.altsep and os.altsep in filename):
|
||||||
|
raise ValidationError(f"Path separators not allowed: {filename}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Examples:
|
||||||
|
validate_filename("video.mp4") # ✓ PASS
|
||||||
|
validate_filename("video...mp4") # ✓ PASS (ellipsis)
|
||||||
|
validate_filename("This is... a video.m4a") # ✓ PASS
|
||||||
|
validate_filename("../../../etc/passwd") # ✗ FAIL (traversal)
|
||||||
|
validate_filename("dir/../file.mp4") # ✗ FAIL (traversal)
|
||||||
|
validate_filename("/etc/passwd") # ✗ FAIL (absolute)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 2: Basename-Only Validation (Simpler)
|
||||||
|
|
||||||
|
Only accept basenames (no directory components at all):
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
|
||||||
|
def validate_filename(filename: str) -> bool:
|
||||||
|
"""
|
||||||
|
Ensure filename contains no path components.
|
||||||
|
"""
|
||||||
|
# Extract basename
|
||||||
|
basename = os.path.basename(filename)
|
||||||
|
|
||||||
|
# Must match original (no path components)
|
||||||
|
if basename != filename:
|
||||||
|
raise ValidationError(f"Filename must not contain path components: {filename}")
|
||||||
|
|
||||||
|
# Additional check: no path separators
|
||||||
|
if "/" in filename or "\\" in filename:
|
||||||
|
raise ValidationError(f"Path separators not allowed: {filename}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Examples:
|
||||||
|
validate_filename("video.mp4") # ✓ PASS
|
||||||
|
validate_filename("video...mp4") # ✓ PASS
|
||||||
|
validate_filename("../file.mp4") # ✗ FAIL
|
||||||
|
validate_filename("dir/file.mp4") # ✗ FAIL
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 3: Regex Pattern Matching (Most Strict)
|
||||||
|
|
||||||
|
Use a whitelist approach for allowed characters:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import re
|
||||||
|
|
||||||
|
def validate_filename(filename: str) -> bool:
|
||||||
|
"""
|
||||||
|
Validate filename using whitelist of safe characters.
|
||||||
|
"""
|
||||||
|
# Allow: letters, numbers, spaces, dots, hyphens, underscores
|
||||||
|
# Length: 1-255 characters
|
||||||
|
pattern = r'^[a-zA-Z0-9 .\-_]{1,255}\.[a-zA-Z0-9]{2,10}$'
|
||||||
|
|
||||||
|
if not re.match(pattern, filename):
|
||||||
|
raise ValidationError(f"Invalid filename format: {filename}")
|
||||||
|
|
||||||
|
# Additional safety: reject if starts/ends with dot
|
||||||
|
if filename.startswith('.') or filename.endswith('.'):
|
||||||
|
raise ValidationError(f"Filename cannot start or end with dot: {filename}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Examples:
|
||||||
|
validate_filename("video.mp4") # ✓ PASS
|
||||||
|
validate_filename("video...mp4") # ✓ PASS
|
||||||
|
validate_filename("My Video... Part 2.m4a") # ✓ PASS
|
||||||
|
validate_filename("../file.mp4") # ✗ FAIL (starts with ..)
|
||||||
|
validate_filename("file<>.mp4") # ✗ FAIL (invalid chars)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
### 1. Locate Current Validation Code
|
||||||
|
|
||||||
|
Search for files containing the validation logic:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
grep -r "Path traversal" /path/to/transcriptor-api
|
||||||
|
grep -r '".."' /path/to/transcriptor-api
|
||||||
|
grep -r "normpath\|basename" /path/to/transcriptor-api
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Update Validation Function
|
||||||
|
|
||||||
|
Replace the current naive check with one of the recommended solutions above.
|
||||||
|
|
||||||
|
**Priority Order:**
|
||||||
|
1. **Option 1** (Path Component Validation) - Best security/usability balance
|
||||||
|
2. **Option 2** (Basename-Only) - Simplest, very secure
|
||||||
|
3. **Option 3** (Regex) - Most restrictive, may reject valid files
|
||||||
|
|
||||||
|
### 3. Test Cases
|
||||||
|
|
||||||
|
Create comprehensive test suite:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
def test_valid_filenames():
|
||||||
|
"""Test filenames that should be accepted."""
|
||||||
|
valid_names = [
|
||||||
|
"video.mp4",
|
||||||
|
"audio.m4a",
|
||||||
|
"This is... a test.mp4",
|
||||||
|
"Part 1... Part 2.wav",
|
||||||
|
"video...multiple...dots.mp3",
|
||||||
|
"My-Video_2024.mp4",
|
||||||
|
"song (remix).m4a",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in valid_names:
|
||||||
|
assert validate_filename(filename), f"Should accept: {filename}"
|
||||||
|
|
||||||
|
def test_dangerous_filenames():
|
||||||
|
"""Test filenames that should be rejected."""
|
||||||
|
dangerous_names = [
|
||||||
|
"../../../etc/passwd",
|
||||||
|
"../../secrets.txt",
|
||||||
|
"../file.mp4",
|
||||||
|
"/etc/passwd",
|
||||||
|
"C:\\Windows\\System32\\file.txt",
|
||||||
|
"dir/../file.mp4",
|
||||||
|
"file/../../etc/passwd",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in dangerous_names:
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
validate_filename(filename)
|
||||||
|
|
||||||
|
def test_edge_cases():
|
||||||
|
"""Test edge cases."""
|
||||||
|
edge_cases = [
|
||||||
|
(".", False), # Current directory
|
||||||
|
("..", False), # Parent directory
|
||||||
|
("...", True), # Just dots (valid)
|
||||||
|
("....", True), # Multiple dots (valid)
|
||||||
|
(".hidden.mp4", True), # Hidden file (valid on Unix)
|
||||||
|
("", False), # Empty string
|
||||||
|
("a" * 256, False), # Too long
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename, should_pass in edge_cases:
|
||||||
|
if should_pass:
|
||||||
|
assert validate_filename(filename)
|
||||||
|
else:
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
validate_filename(filename)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Update Error Response
|
||||||
|
|
||||||
|
Provide clearer error messages:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# BAD (current)
|
||||||
|
{"detail": {"error": "Upload failed", "message": "Audio file validation failed: Path traversal (..) is not allowed"}}
|
||||||
|
|
||||||
|
# GOOD (improved)
|
||||||
|
{
|
||||||
|
"detail": {
|
||||||
|
"error": "Invalid filename",
|
||||||
|
"message": "Filename contains path traversal characters. Please use only the filename without directory paths.",
|
||||||
|
"filename": "../../etc/passwd",
|
||||||
|
"suggestion": "Use: passwd.txt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing the Fix
|
||||||
|
|
||||||
|
### Manual Testing
|
||||||
|
|
||||||
|
1. **Test with problematic filename from bug report:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://192.168.1.210:33767/transcribe \
|
||||||
|
-F "file=@/path/to/This Weird FPV Drone only takes one kind of Battery... Rekon 35 V2.m4a" \
|
||||||
|
-F "model=medium"
|
||||||
|
```
|
||||||
|
Expected: HTTP 200 (success)
|
||||||
|
|
||||||
|
2. **Test with actual path traversal:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://192.168.1.210:33767/transcribe \
|
||||||
|
-F "file=@/tmp/test.m4a;filename=../../etc/passwd" \
|
||||||
|
-F "model=medium"
|
||||||
|
```
|
||||||
|
Expected: HTTP 400 (validation error)
|
||||||
|
|
||||||
|
3. **Test with various ellipsis patterns:**
|
||||||
|
- `"video...mp4"` → Should pass
|
||||||
|
- `"Part 1... Part 2.m4a"` → Should pass
|
||||||
|
- `"Wait... what!.mp4"` → Should pass
|
||||||
|
|
||||||
|
### Automated Testing
|
||||||
|
|
||||||
|
```python
|
||||||
|
# integration_test.py
|
||||||
|
import requests
|
||||||
|
|
||||||
|
def test_ellipsis_filenames():
|
||||||
|
"""Test files with ellipsis in names."""
|
||||||
|
test_cases = [
|
||||||
|
"video...mp4",
|
||||||
|
"This is... a test.m4a",
|
||||||
|
"Wait... what.mp3",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in test_cases:
|
||||||
|
response = requests.post(
|
||||||
|
"http://192.168.1.210:33767/transcribe",
|
||||||
|
files={"file": (filename, open("test_audio.m4a", "rb"))},
|
||||||
|
data={"model": "medium"}
|
||||||
|
)
|
||||||
|
assert response.status_code == 200, f"Failed for: {filename}"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
### What We're Protecting Against
|
||||||
|
|
||||||
|
1. **Path Traversal**: `../../../sensitive/file`
|
||||||
|
2. **Absolute Paths**: `/etc/passwd` or `C:\Windows\System32\`
|
||||||
|
3. **Hidden Paths**: `./.git/config`
|
||||||
|
|
||||||
|
### What We're NOT Breaking
|
||||||
|
|
||||||
|
1. **Ellipsis in titles**: `"Wait... what.mp4"`
|
||||||
|
2. **Multiple extensions**: `"file.tar.gz"`
|
||||||
|
3. **Special characters**: `"My Video (2024).mp4"`
|
||||||
|
|
||||||
|
### Additional Hardening (Optional)
|
||||||
|
|
||||||
|
```python
|
||||||
|
def sanitize_and_validate_filename(filename: str) -> str:
|
||||||
|
"""
|
||||||
|
Sanitize filename and validate for safety.
|
||||||
|
Returns cleaned filename or raises error.
|
||||||
|
"""
|
||||||
|
# Remove null bytes
|
||||||
|
filename = filename.replace("\0", "")
|
||||||
|
|
||||||
|
# Extract basename (strips any path components)
|
||||||
|
filename = os.path.basename(filename)
|
||||||
|
|
||||||
|
# Limit length
|
||||||
|
max_length = 255
|
||||||
|
if len(filename) > max_length:
|
||||||
|
name, ext = os.path.splitext(filename)
|
||||||
|
filename = name[:max_length-len(ext)] + ext
|
||||||
|
|
||||||
|
# Validate
|
||||||
|
validate_filename(filename)
|
||||||
|
|
||||||
|
return filename
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deployment Checklist
|
||||||
|
|
||||||
|
- [ ] Update validation function with recommended fix
|
||||||
|
- [ ] Add comprehensive test suite
|
||||||
|
- [ ] Test with real-world filenames (including bug report case)
|
||||||
|
- [ ] Test security: attempt path traversal attacks
|
||||||
|
- [ ] Update API documentation
|
||||||
|
- [ ] Review error messages for clarity
|
||||||
|
- [ ] Deploy to staging environment
|
||||||
|
- [ ] Run integration tests
|
||||||
|
- [ ] Monitor logs for validation failures
|
||||||
|
- [ ] Deploy to production
|
||||||
|
- [ ] Verify bug reporter's file now works
|
||||||
|
|
||||||
|
## Contact & Context
|
||||||
|
|
||||||
|
**Bug Report Date**: 2025-10-26
|
||||||
|
**Affected Endpoint**: `POST /transcribe`
|
||||||
|
**Error Code**: HTTP 500
|
||||||
|
**Client Application**: yt-dlp-webui v3
|
||||||
|
|
||||||
|
**Example Failing Request:**
|
||||||
|
```
|
||||||
|
POST http://192.168.1.210:33767/transcribe
|
||||||
|
Content-Type: multipart/form-data
|
||||||
|
|
||||||
|
file: "This Weird FPV Drone only takes one kind of Battery... Rekon 35 V2.m4a"
|
||||||
|
model: "medium"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Current Behavior**: Returns 500 error with path traversal message
|
||||||
|
**Expected Behavior**: Accepts file and processes transcription
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
### Files to Check
|
||||||
|
- `/path/to/api/validators.py` or similar
|
||||||
|
- `/path/to/api/upload_handler.py`
|
||||||
|
- `/path/to/api/routes/transcribe.py`
|
||||||
|
|
||||||
|
### Search Commands
|
||||||
|
```bash
|
||||||
|
# Find validation code
|
||||||
|
rg "Path traversal" --type py
|
||||||
|
rg '"\.\."' --type py
|
||||||
|
rg "ValidationError.*filename" --type py
|
||||||
|
|
||||||
|
# Find upload handlers
|
||||||
|
rg "def.*upload|def.*transcribe" --type py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Priority Fix
|
||||||
|
Use **Option 1 (Path Component Validation)** - it provides the best balance of security and usability.
|
||||||
19
docker-build.sh
Executable file
19
docker-build.sh
Executable file
@@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
datetime_prefix() {
|
||||||
|
date "+[%Y-%m-%d %H:%M:%S]"
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "$(datetime_prefix) Building Whisper Transcriptor Docker image..."
|
||||||
|
|
||||||
|
# Build the Docker image
|
||||||
|
docker build -t transcriptor-apimcp:latest .
|
||||||
|
|
||||||
|
echo "$(datetime_prefix) Build complete!"
|
||||||
|
echo "$(datetime_prefix) Image: transcriptor-apimcp:latest"
|
||||||
|
echo ""
|
||||||
|
echo "Usage:"
|
||||||
|
echo " API mode: ./docker-run-api.sh"
|
||||||
|
echo " MCP mode: ./docker-run-mcp.sh"
|
||||||
|
echo " Or use: docker-compose up transcriptor-api"
|
||||||
106
docker-compose.yml
Normal file
106
docker-compose.yml
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
version: '3.8'
|
||||||
|
|
||||||
|
services:
|
||||||
|
# API Server mode with nginx reverse proxy
|
||||||
|
transcriptor-api:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: transcriptor-apimcp:latest
|
||||||
|
container_name: transcriptor-api
|
||||||
|
runtime: nvidia
|
||||||
|
environment:
|
||||||
|
NVIDIA_VISIBLE_DEVICES: "0"
|
||||||
|
NVIDIA_DRIVER_CAPABILITIES: compute,utility
|
||||||
|
SERVER_MODE: api
|
||||||
|
API_HOST: 127.0.0.1
|
||||||
|
API_PORT: 33767
|
||||||
|
WHISPER_MODEL_DIR: /models
|
||||||
|
TRANSCRIPTION_OUTPUT_DIR: /outputs
|
||||||
|
TRANSCRIPTION_BATCH_OUTPUT_DIR: /outputs/batch
|
||||||
|
TRANSCRIPTION_MODEL: large-v3
|
||||||
|
TRANSCRIPTION_DEVICE: auto
|
||||||
|
TRANSCRIPTION_COMPUTE_TYPE: auto
|
||||||
|
TRANSCRIPTION_OUTPUT_FORMAT: txt
|
||||||
|
TRANSCRIPTION_BEAM_SIZE: 5
|
||||||
|
TRANSCRIPTION_TEMPERATURE: 0.0
|
||||||
|
JOB_QUEUE_MAX_SIZE: 5
|
||||||
|
JOB_METADATA_DIR: /outputs/jobs
|
||||||
|
JOB_RETENTION_DAYS: 7
|
||||||
|
GPU_HEALTH_CHECK_ENABLED: "true"
|
||||||
|
GPU_HEALTH_CHECK_INTERVAL_MINUTES: 10
|
||||||
|
GPU_HEALTH_TEST_MODEL: tiny
|
||||||
|
GPU_HEALTH_TEST_AUDIO: /test-audio/test.mp3
|
||||||
|
GPU_RESET_COOLDOWN_MINUTES: 5
|
||||||
|
# Optional proxy settings (uncomment if needed)
|
||||||
|
# HTTP_PROXY: http://192.168.1.212:8080
|
||||||
|
# HTTPS_PROXY: http://192.168.1.212:8080
|
||||||
|
ports:
|
||||||
|
- "33767:80" # Map host:33767 to container nginx:80
|
||||||
|
volumes:
|
||||||
|
- /home/uad/agents/tools/mcp-transcriptor/models:/models
|
||||||
|
- /home/uad/agents/tools/mcp-transcriptor/outputs:/outputs
|
||||||
|
- /home/uad/agents/tools/mcp-transcriptor/logs:/logs
|
||||||
|
- /home/uad/agents/tools/mcp-transcriptor/data/test.mp3:/test-audio/test.mp3:ro
|
||||||
|
- /etc/localtime:/etc/localtime:ro # Sync container time with host
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 40s
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- transcriptor-network
|
||||||
|
|
||||||
|
# MCP Server mode (stdio based)
|
||||||
|
transcriptor-mcp:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: transcriptor-apimcp:latest
|
||||||
|
container_name: transcriptor-mcp
|
||||||
|
environment:
|
||||||
|
SERVER_MODE: mcp
|
||||||
|
WHISPER_MODEL_DIR: /models
|
||||||
|
TRANSCRIPTION_OUTPUT_DIR: /outputs
|
||||||
|
TRANSCRIPTION_BATCH_OUTPUT_DIR: /outputs/batch
|
||||||
|
TRANSCRIPTION_MODEL: large-v3
|
||||||
|
TRANSCRIPTION_DEVICE: auto
|
||||||
|
TRANSCRIPTION_COMPUTE_TYPE: auto
|
||||||
|
TRANSCRIPTION_OUTPUT_FORMAT: txt
|
||||||
|
TRANSCRIPTION_BEAM_SIZE: 5
|
||||||
|
TRANSCRIPTION_TEMPERATURE: 0.0
|
||||||
|
JOB_QUEUE_MAX_SIZE: 100
|
||||||
|
JOB_METADATA_DIR: /outputs/jobs
|
||||||
|
JOB_RETENTION_DAYS: 7
|
||||||
|
GPU_HEALTH_CHECK_ENABLED: "true"
|
||||||
|
GPU_HEALTH_CHECK_INTERVAL_MINUTES: 10
|
||||||
|
GPU_HEALTH_TEST_MODEL: tiny
|
||||||
|
GPU_RESET_COOLDOWN_MINUTES: 5
|
||||||
|
# Optional proxy settings (uncomment if needed)
|
||||||
|
# HTTP_PROXY: http://192.168.1.212:8080
|
||||||
|
# HTTPS_PROXY: http://192.168.1.212:8080
|
||||||
|
volumes:
|
||||||
|
- /home/uad/agents/tools/mcp-transcriptor/models:/models
|
||||||
|
- /home/uad/agents/tools/mcp-transcriptor/outputs:/outputs
|
||||||
|
- /home/uad/agents/tools/mcp-transcriptor/logs:/logs
|
||||||
|
- /etc/localtime:/etc/localtime:ro
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
count: 1
|
||||||
|
capabilities: [gpu]
|
||||||
|
stdin_open: true # Enable stdin for MCP stdio mode
|
||||||
|
tty: true
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- transcriptor-network
|
||||||
|
profiles:
|
||||||
|
- mcp # Only start when explicitly requested
|
||||||
|
|
||||||
|
networks:
|
||||||
|
transcriptor-network:
|
||||||
|
driver: bridge
|
||||||
67
docker-entrypoint.sh
Executable file
67
docker-entrypoint.sh
Executable file
@@ -0,0 +1,67 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Docker Entrypoint Script for Whisper Transcriptor
|
||||||
|
# Supports both MCP and API server modes
|
||||||
|
|
||||||
|
datetime_prefix() {
|
||||||
|
date "+[%Y-%m-%d %H:%M:%S]"
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "$(datetime_prefix) Starting Whisper Transcriptor in ${SERVER_MODE} mode..."
|
||||||
|
|
||||||
|
# Ensure required directories exist
|
||||||
|
mkdir -p "$WHISPER_MODEL_DIR"
|
||||||
|
mkdir -p "$TRANSCRIPTION_OUTPUT_DIR"
|
||||||
|
mkdir -p "$TRANSCRIPTION_BATCH_OUTPUT_DIR"
|
||||||
|
mkdir -p "$JOB_METADATA_DIR"
|
||||||
|
mkdir -p /app/outputs/uploads
|
||||||
|
|
||||||
|
# Display GPU information
|
||||||
|
if command -v nvidia-smi &> /dev/null; then
|
||||||
|
echo "$(datetime_prefix) GPU Information:"
|
||||||
|
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
|
||||||
|
else
|
||||||
|
echo "$(datetime_prefix) Warning: nvidia-smi not found. GPU may not be available."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check server mode and start appropriate service
|
||||||
|
case "${SERVER_MODE}" in
|
||||||
|
"api")
|
||||||
|
echo "$(datetime_prefix) Starting API Server mode with nginx reverse proxy"
|
||||||
|
|
||||||
|
# Update nginx configuration to use correct backend
|
||||||
|
sed -i "s/server 127.0.0.1:33767;/server ${API_HOST}:${API_PORT};/" /etc/nginx/sites-available/transcriptor.conf
|
||||||
|
|
||||||
|
# Enable nginx site
|
||||||
|
ln -sf /etc/nginx/sites-available/transcriptor.conf /etc/nginx/sites-enabled/
|
||||||
|
rm -f /etc/nginx/sites-enabled/default
|
||||||
|
|
||||||
|
# Test nginx configuration
|
||||||
|
echo "$(datetime_prefix) Testing nginx configuration..."
|
||||||
|
nginx -t
|
||||||
|
|
||||||
|
# Start nginx in background
|
||||||
|
echo "$(datetime_prefix) Starting nginx..."
|
||||||
|
nginx
|
||||||
|
|
||||||
|
# Start API server (foreground - this keeps container running)
|
||||||
|
echo "$(datetime_prefix) Starting API server on ${API_HOST}:${API_PORT}"
|
||||||
|
echo "$(datetime_prefix) API accessible via nginx on port 80"
|
||||||
|
exec python -u /app/src/servers/api_server.py
|
||||||
|
;;
|
||||||
|
|
||||||
|
"mcp")
|
||||||
|
echo "$(datetime_prefix) Starting MCP Server mode (stdio)"
|
||||||
|
echo "$(datetime_prefix) Model directory: $WHISPER_MODEL_DIR"
|
||||||
|
|
||||||
|
# Start MCP server in stdio mode
|
||||||
|
exec python -u /app/src/servers/whisper_server.py
|
||||||
|
;;
|
||||||
|
|
||||||
|
*)
|
||||||
|
echo "$(datetime_prefix) ERROR: Invalid SERVER_MODE: ${SERVER_MODE}"
|
||||||
|
echo "$(datetime_prefix) Valid modes: 'api' or 'mcp'"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
62
docker-run-api.sh
Executable file
62
docker-run-api.sh
Executable file
@@ -0,0 +1,62 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
datetime_prefix() {
|
||||||
|
date "+[%Y-%m-%d %H:%M:%S]"
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "$(datetime_prefix) Starting Whisper Transcriptor in API mode with nginx..."
|
||||||
|
|
||||||
|
# Check if image exists
|
||||||
|
if ! docker image inspect transcriptor-apimcp:latest &> /dev/null; then
|
||||||
|
echo "$(datetime_prefix) Image not found. Building first..."
|
||||||
|
./docker-build.sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Stop and remove existing container if running
|
||||||
|
if docker ps -a --format '{{.Names}}' | grep -q '^transcriptor-api$'; then
|
||||||
|
echo "$(datetime_prefix) Stopping existing container..."
|
||||||
|
docker stop transcriptor-api || true
|
||||||
|
docker rm transcriptor-api || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Run the container in API mode
|
||||||
|
docker run -d \
|
||||||
|
--name transcriptor-api \
|
||||||
|
--gpus all \
|
||||||
|
-p 33767:80 \
|
||||||
|
-e SERVER_MODE=api \
|
||||||
|
-e API_HOST=127.0.0.1 \
|
||||||
|
-e API_PORT=33767 \
|
||||||
|
-e CUDA_VISIBLE_DEVICES=0 \
|
||||||
|
-e TRANSCRIPTION_MODEL=large-v3 \
|
||||||
|
-e TRANSCRIPTION_DEVICE=auto \
|
||||||
|
-e TRANSCRIPTION_COMPUTE_TYPE=auto \
|
||||||
|
-e JOB_QUEUE_MAX_SIZE=5 \
|
||||||
|
-v "$(pwd)/models:/models" \
|
||||||
|
-v "$(pwd)/outputs:/outputs" \
|
||||||
|
-v "$(pwd)/logs:/logs" \
|
||||||
|
--restart unless-stopped \
|
||||||
|
transcriptor-apimcp:latest
|
||||||
|
|
||||||
|
echo "$(datetime_prefix) Container started!"
|
||||||
|
echo ""
|
||||||
|
echo "API Server running at: http://localhost:33767"
|
||||||
|
echo ""
|
||||||
|
echo "Useful commands:"
|
||||||
|
echo " Check logs: docker logs -f transcriptor-api"
|
||||||
|
echo " Check status: docker ps | grep transcriptor-api"
|
||||||
|
echo " Test health: curl http://localhost:33767/health"
|
||||||
|
echo " Test GPU: curl http://localhost:33767/health/gpu"
|
||||||
|
echo " Stop container: docker stop transcriptor-api"
|
||||||
|
echo " Restart: docker restart transcriptor-api"
|
||||||
|
echo ""
|
||||||
|
echo "$(datetime_prefix) Waiting for service to start..."
|
||||||
|
sleep 5
|
||||||
|
|
||||||
|
# Test health endpoint
|
||||||
|
if curl -s http://localhost:33767/health > /dev/null 2>&1; then
|
||||||
|
echo "$(datetime_prefix) ✓ Service is healthy!"
|
||||||
|
else
|
||||||
|
echo "$(datetime_prefix) ⚠ Service not responding yet. Check logs with: docker logs transcriptor-api"
|
||||||
|
fi
|
||||||
40
docker-run-mcp.sh
Executable file
40
docker-run-mcp.sh
Executable file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
datetime_prefix() {
|
||||||
|
date "+[%Y-%m-%d %H:%M:%S]"
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "$(datetime_prefix) Starting Whisper Transcriptor in MCP mode..."
|
||||||
|
|
||||||
|
# Check if image exists
|
||||||
|
if ! docker image inspect transcriptor-apimcp:latest &> /dev/null; then
|
||||||
|
echo "$(datetime_prefix) Image not found. Building first..."
|
||||||
|
./docker-build.sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Stop and remove existing container if running
|
||||||
|
if docker ps -a --format '{{.Names}}' | grep -q '^transcriptor-mcp$'; then
|
||||||
|
echo "$(datetime_prefix) Stopping existing container..."
|
||||||
|
docker stop transcriptor-mcp || true
|
||||||
|
docker rm transcriptor-mcp || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Run the container in MCP mode (interactive stdio)
|
||||||
|
echo "$(datetime_prefix) Starting MCP server in stdio mode..."
|
||||||
|
echo "$(datetime_prefix) Press Ctrl+C to stop"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
docker run -it --rm \
|
||||||
|
--name transcriptor-mcp \
|
||||||
|
--gpus all \
|
||||||
|
-e SERVER_MODE=mcp \
|
||||||
|
-e CUDA_VISIBLE_DEVICES=0 \
|
||||||
|
-e TRANSCRIPTION_MODEL=large-v3 \
|
||||||
|
-e TRANSCRIPTION_DEVICE=auto \
|
||||||
|
-e TRANSCRIPTION_COMPUTE_TYPE=auto \
|
||||||
|
-e JOB_QUEUE_MAX_SIZE=100 \
|
||||||
|
-v "$(pwd)/models:/models" \
|
||||||
|
-v "$(pwd)/outputs:/outputs" \
|
||||||
|
-v "$(pwd)/logs:/logs" \
|
||||||
|
transcriptor-apimcp:latest
|
||||||
28
reset_gpu.sh
28
reset_gpu.sh
@@ -2,12 +2,28 @@
|
|||||||
|
|
||||||
# Script to reset NVIDIA GPU drivers without rebooting
|
# Script to reset NVIDIA GPU drivers without rebooting
|
||||||
# This reloads kernel modules and restarts nvidia-persistenced service
|
# This reloads kernel modules and restarts nvidia-persistenced service
|
||||||
|
# Also handles stopping/starting Ollama to release GPU resources
|
||||||
|
|
||||||
echo "============================================================"
|
echo "============================================================"
|
||||||
echo "NVIDIA GPU Driver Reset Script"
|
echo "NVIDIA GPU Driver Reset Script"
|
||||||
echo "============================================================"
|
echo "============================================================"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
|
# Stop Ollama via supervisorctl
|
||||||
|
echo "Stopping Ollama service..."
|
||||||
|
sudo supervisorctl stop ollama 2>/dev/null
|
||||||
|
if [ $? -eq 0 ]; then
|
||||||
|
echo "✓ Ollama stopped via supervisorctl"
|
||||||
|
OLLAMA_WAS_RUNNING=true
|
||||||
|
else
|
||||||
|
echo " Ollama not running or supervisorctl not available"
|
||||||
|
OLLAMA_WAS_RUNNING=false
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Give Ollama time to release GPU resources
|
||||||
|
sleep 2
|
||||||
|
|
||||||
# Stop nvidia-persistenced service
|
# Stop nvidia-persistenced service
|
||||||
echo "Stopping nvidia-persistenced service..."
|
echo "Stopping nvidia-persistenced service..."
|
||||||
sudo systemctl stop nvidia-persistenced
|
sudo systemctl stop nvidia-persistenced
|
||||||
@@ -65,6 +81,18 @@ else
|
|||||||
fi
|
fi
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
|
# Restart Ollama if it was running
|
||||||
|
if [ "$OLLAMA_WAS_RUNNING" = true ]; then
|
||||||
|
echo "Restarting Ollama service..."
|
||||||
|
sudo supervisorctl start ollama
|
||||||
|
if [ $? -eq 0 ]; then
|
||||||
|
echo "✓ Ollama restarted"
|
||||||
|
else
|
||||||
|
echo "✗ Failed to restart Ollama"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
fi
|
||||||
|
|
||||||
echo "============================================================"
|
echo "============================================================"
|
||||||
echo "GPU driver reset completed successfully"
|
echo "GPU driver reset completed successfully"
|
||||||
echo "============================================================"
|
echo "============================================================"
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ with strict failure handling to prevent silent CPU fallbacks.
|
|||||||
Includes circuit breaker pattern to prevent repeated failed checks.
|
Includes circuit breaker pattern to prevent repeated failed checks.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
import threading
|
import threading
|
||||||
@@ -14,7 +15,6 @@ from datetime import datetime
|
|||||||
from typing import Optional, List
|
from typing import Optional, List
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from utils.test_audio_generator import generate_test_audio
|
|
||||||
from utils.circuit_breaker import CircuitBreaker, CircuitBreakerOpen
|
from utils.circuit_breaker import CircuitBreaker, CircuitBreakerOpen
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -109,8 +109,18 @@ def _check_gpu_health_internal(expected_device: str = "auto") -> GPUHealthStatus
|
|||||||
logger.warning(f"Failed to get GPU info: {e}")
|
logger.warning(f"Failed to get GPU info: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Generate test audio
|
# Get test audio path from environment variable
|
||||||
test_audio_path = generate_test_audio(duration_seconds=1.0)
|
test_audio_path = os.getenv("GPU_HEALTH_TEST_AUDIO")
|
||||||
|
|
||||||
|
if not test_audio_path:
|
||||||
|
raise ValueError("GPU_HEALTH_TEST_AUDIO environment variable not set")
|
||||||
|
|
||||||
|
# Verify test audio file exists
|
||||||
|
if not os.path.exists(test_audio_path):
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"Test audio file not found: {test_audio_path}. "
|
||||||
|
f"Please ensure test audio exists before running GPU health checks."
|
||||||
|
)
|
||||||
|
|
||||||
# Import here to avoid circular dependencies
|
# Import here to avoid circular dependencies
|
||||||
from faster_whisper import WhisperModel
|
from faster_whisper import WhisperModel
|
||||||
@@ -129,6 +139,7 @@ def _check_gpu_health_internal(expected_device: str = "auto") -> GPUHealthStatus
|
|||||||
gpu_memory_before = torch.cuda.memory_allocated(0)
|
gpu_memory_before = torch.cuda.memory_allocated(0)
|
||||||
|
|
||||||
# Load tiny model and transcribe
|
# Load tiny model and transcribe
|
||||||
|
model = None
|
||||||
try:
|
try:
|
||||||
model = WhisperModel(
|
model = WhisperModel(
|
||||||
"tiny",
|
"tiny",
|
||||||
@@ -140,7 +151,7 @@ def _check_gpu_health_internal(expected_device: str = "auto") -> GPUHealthStatus
|
|||||||
segments, info = model.transcribe(test_audio_path, beam_size=1)
|
segments, info = model.transcribe(test_audio_path, beam_size=1)
|
||||||
|
|
||||||
# Consume segments (needed to actually run inference)
|
# Consume segments (needed to actually run inference)
|
||||||
list(segments)
|
segments_list = list(segments)
|
||||||
|
|
||||||
# Check if GPU was actually used
|
# Check if GPU was actually used
|
||||||
# faster-whisper uses CTranslate2 which manages GPU memory separately
|
# faster-whisper uses CTranslate2 which manages GPU memory separately
|
||||||
@@ -156,6 +167,19 @@ def _check_gpu_health_internal(expected_device: str = "auto") -> GPUHealthStatus
|
|||||||
actual_device = "cpu"
|
actual_device = "cpu"
|
||||||
gpu_working = False
|
gpu_working = False
|
||||||
logger.error(f"GPU health check failed: {error_msg}")
|
logger.error(f"GPU health check failed: {error_msg}")
|
||||||
|
finally:
|
||||||
|
# Clean up model resources to prevent GPU memory leak
|
||||||
|
if model is not None:
|
||||||
|
try:
|
||||||
|
del model
|
||||||
|
segments_list = None
|
||||||
|
# Force garbage collection and empty CUDA cache
|
||||||
|
import gc
|
||||||
|
gc.collect()
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
except Exception as cleanup_error:
|
||||||
|
logger.warning(f"Error cleaning up GPU health check model: {cleanup_error}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Health check setup failed: {str(e)}"
|
error_msg = f"Health check setup failed: {str(e)}"
|
||||||
|
|||||||
@@ -129,6 +129,9 @@ def transcribe_audio(
|
|||||||
logger.info("Using standard model for transcription...")
|
logger.info("Using standard model for transcription...")
|
||||||
segments, info = model_instance['model'].transcribe(audio_source, **options)
|
segments, info = model_instance['model'].transcribe(audio_source, **options)
|
||||||
|
|
||||||
|
# Convert segments generator to list to release model resources
|
||||||
|
segments = list(segments)
|
||||||
|
|
||||||
# Determine output directory and path early
|
# Determine output directory and path early
|
||||||
audio_dir = os.path.dirname(audio_path)
|
audio_dir = os.path.dirname(audio_path)
|
||||||
audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
|
audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
|
||||||
@@ -253,6 +256,19 @@ def transcribe_audio(
|
|||||||
logger.error(f"Transcription failed: {str(e)}")
|
logger.error(f"Transcription failed: {str(e)}")
|
||||||
return f"Error occurred during transcription: {str(e)}"
|
return f"Error occurred during transcription: {str(e)}"
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# Force GPU memory cleanup after transcription to prevent accumulation
|
||||||
|
if device == "cuda":
|
||||||
|
import torch
|
||||||
|
import gc
|
||||||
|
# Clear segments list to free memory
|
||||||
|
segments = None
|
||||||
|
# Force garbage collection
|
||||||
|
gc.collect()
|
||||||
|
# Empty CUDA cache
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
logger.debug("GPU memory cleaned up after transcription")
|
||||||
|
|
||||||
|
|
||||||
def batch_transcribe(
|
def batch_transcribe(
|
||||||
audio_folder: str,
|
audio_folder: str,
|
||||||
|
|||||||
@@ -33,7 +33,8 @@ from utils.input_validation import (
|
|||||||
validate_model_name,
|
validate_model_name,
|
||||||
validate_device,
|
validate_device,
|
||||||
validate_compute_type,
|
validate_compute_type,
|
||||||
validate_output_format
|
validate_output_format,
|
||||||
|
validate_filename_safe
|
||||||
)
|
)
|
||||||
|
|
||||||
# Logging configuration
|
# Logging configuration
|
||||||
@@ -218,6 +219,8 @@ async def transcribe_upload(
|
|||||||
try:
|
try:
|
||||||
# Validate form parameters early
|
# Validate form parameters early
|
||||||
try:
|
try:
|
||||||
|
# Validate filename for security (basename-only, no path traversal)
|
||||||
|
validate_filename_safe(file.filename)
|
||||||
model = validate_model_name(model)
|
model = validate_model_name(model)
|
||||||
output_format = validate_output_format(output_format)
|
output_format = validate_output_format(output_format)
|
||||||
beam_size = validate_beam_size(beam_size)
|
beam_size = validate_beam_size(beam_size)
|
||||||
@@ -663,9 +666,11 @@ if __name__ == "__main__":
|
|||||||
# Perform startup GPU health check
|
# Perform startup GPU health check
|
||||||
from utils.startup import perform_startup_gpu_check
|
from utils.startup import perform_startup_gpu_check
|
||||||
|
|
||||||
|
# Disable auto_reset in Docker (sudo not available, GPU reset won't work)
|
||||||
|
in_docker = os.path.exists('/.dockerenv')
|
||||||
perform_startup_gpu_check(
|
perform_startup_gpu_check(
|
||||||
required_device="cuda",
|
required_device="cuda",
|
||||||
auto_reset=True,
|
auto_reset=not in_docker,
|
||||||
exit_on_failure=True
|
exit_on_failure=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,72 @@ def sanitize_error_message(error_msg: str, sanitize_paths: bool = True) -> str:
|
|||||||
return sanitized
|
return sanitized
|
||||||
|
|
||||||
|
|
||||||
|
def validate_filename_safe(filename: str) -> str:
|
||||||
|
"""
|
||||||
|
Validate uploaded filename for security (basename-only validation).
|
||||||
|
|
||||||
|
This function is specifically for validating uploaded filenames to ensure
|
||||||
|
they don't contain path traversal attempts. It enforces that the filename:
|
||||||
|
- Contains no directory separators (/, \)
|
||||||
|
- Has no path components (must be basename only)
|
||||||
|
- Contains no null bytes
|
||||||
|
- Has a valid audio file extension
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filename: Filename to validate (should be basename only, not full path)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Validated filename (unchanged if valid)
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValidationError: If filename is invalid or empty
|
||||||
|
PathTraversalError: If filename contains path components or traversal attempts
|
||||||
|
InvalidFileTypeError: If file extension is not allowed
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
validate_filename_safe("video.mp4") # ✓ PASS
|
||||||
|
validate_filename_safe("audio...mp3") # ✓ PASS (ellipsis OK)
|
||||||
|
validate_filename_safe("Wait... what.m4a") # ✓ PASS
|
||||||
|
validate_filename_safe("../../../etc/passwd") # ✗ FAIL (traversal)
|
||||||
|
validate_filename_safe("dir/file.mp4") # ✗ FAIL (path separator)
|
||||||
|
validate_filename_safe("/etc/passwd") # ✗ FAIL (absolute path)
|
||||||
|
"""
|
||||||
|
if not filename:
|
||||||
|
raise ValidationError("Filename cannot be empty")
|
||||||
|
|
||||||
|
# Check for null bytes
|
||||||
|
if "\x00" in filename:
|
||||||
|
logger.warning(f"Null byte in filename detected: {filename}")
|
||||||
|
raise PathTraversalError("Null bytes in filename are not allowed")
|
||||||
|
|
||||||
|
# Extract basename - if it differs from original, filename contained path components
|
||||||
|
basename = os.path.basename(filename)
|
||||||
|
if basename != filename:
|
||||||
|
logger.warning(f"Filename contains path components: {filename}")
|
||||||
|
raise PathTraversalError(
|
||||||
|
"Filename must not contain path components. "
|
||||||
|
f"Use only the filename: {basename}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Additional check: explicitly reject any path separators
|
||||||
|
if "/" in filename or "\\" in filename:
|
||||||
|
logger.warning(f"Path separators in filename: {filename}")
|
||||||
|
raise PathTraversalError("Path separators (/ or \\) are not allowed in filename")
|
||||||
|
|
||||||
|
# Check file extension (case-insensitive)
|
||||||
|
file_ext = Path(filename).suffix.lower()
|
||||||
|
if not file_ext:
|
||||||
|
raise InvalidFileTypeError("Filename must have a file extension")
|
||||||
|
|
||||||
|
if file_ext not in ALLOWED_AUDIO_EXTENSIONS:
|
||||||
|
raise InvalidFileTypeError(
|
||||||
|
f"Unsupported audio format: {file_ext}. "
|
||||||
|
f"Supported: {', '.join(sorted(ALLOWED_AUDIO_EXTENSIONS))}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return filename
|
||||||
|
|
||||||
|
|
||||||
def validate_path_safe(file_path: str, allowed_dirs: Optional[List[str]] = None) -> Path:
|
def validate_path_safe(file_path: str, allowed_dirs: Optional[List[str]] = None) -> Path:
|
||||||
"""
|
"""
|
||||||
Validate and sanitize a file path to prevent directory traversal attacks.
|
Validate and sanitize a file path to prevent directory traversal attacks.
|
||||||
|
|||||||
@@ -1,26 +0,0 @@
|
|||||||
[program:transcriptor-api]
|
|
||||||
command=/home/uad/agents/tools/mcp-transcriptor/venv/bin/python /home/uad/agents/tools/mcp-transcriptor/src/servers/api_server.py
|
|
||||||
directory=/home/uad/agents/tools/mcp-transcriptor
|
|
||||||
user=uad
|
|
||||||
autostart=true
|
|
||||||
autorestart=true
|
|
||||||
redirect_stderr=true
|
|
||||||
stdout_logfile=/home/uad/agents/tools/mcp-transcriptor/logs/transcriptor-api.log
|
|
||||||
stdout_logfile_maxbytes=50MB
|
|
||||||
stdout_logfile_backups=10
|
|
||||||
environment=
|
|
||||||
PYTHONPATH="/home/uad/agents/tools/mcp-transcriptor/src",
|
|
||||||
CUDA_VISIBLE_DEVICES="0",
|
|
||||||
API_HOST="0.0.0.0",
|
|
||||||
API_PORT="33767",
|
|
||||||
WHISPER_MODEL_DIR="/home/uad/agents/tools/mcp-transcriptor/models",
|
|
||||||
TRANSCRIPTION_OUTPUT_DIR="/home/uad/agents/tools/mcp-transcriptor/outputs",
|
|
||||||
TRANSCRIPTION_BATCH_OUTPUT_DIR="/home/uad/agents/tools/mcp-transcriptor/outputs/batch",
|
|
||||||
TRANSCRIPTION_MODEL="large-v3",
|
|
||||||
TRANSCRIPTION_DEVICE="auto",
|
|
||||||
TRANSCRIPTION_COMPUTE_TYPE="auto",
|
|
||||||
TRANSCRIPTION_OUTPUT_FORMAT="txt",
|
|
||||||
HTTP_PROXY=http://192.168.1.212:8080,
|
|
||||||
HTTPS_PROXY=http://192.168.1.212:8080
|
|
||||||
stopwaitsecs=10
|
|
||||||
stopsignal=TERM
|
|
||||||
60
test_filename_fix.py
Normal file
60
test_filename_fix.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Quick manual test to verify the filename validation fix.
|
||||||
|
Tests the exact case from the bug report.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, 'src')
|
||||||
|
|
||||||
|
from utils.input_validation import validate_filename_safe, PathTraversalError
|
||||||
|
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("FILENAME VALIDATION FIX - MANUAL TEST")
|
||||||
|
print("="*70 + "\n")
|
||||||
|
|
||||||
|
# Bug report case
|
||||||
|
bug_report_filename = "This Weird FPV Drone only takes one kind of Battery... Rekon 35 V2.m4a"
|
||||||
|
|
||||||
|
print(f"Testing bug report filename:")
|
||||||
|
print(f" '{bug_report_filename}'")
|
||||||
|
print()
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = validate_filename_safe(bug_report_filename)
|
||||||
|
print(f"✅ SUCCESS: Filename accepted!")
|
||||||
|
print(f" Returned: '{result}'")
|
||||||
|
except PathTraversalError as e:
|
||||||
|
print(f"❌ FAILED: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ ERROR: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Test that security still works
|
||||||
|
print("Verifying security (path traversal should still be blocked):")
|
||||||
|
dangerous_filenames = [
|
||||||
|
"../../../etc/passwd",
|
||||||
|
"../../secrets.txt",
|
||||||
|
"dir/file.m4a",
|
||||||
|
]
|
||||||
|
|
||||||
|
for dangerous in dangerous_filenames:
|
||||||
|
try:
|
||||||
|
validate_filename_safe(dangerous)
|
||||||
|
print(f"❌ SECURITY ISSUE: '{dangerous}' was accepted (should be blocked!)")
|
||||||
|
sys.exit(1)
|
||||||
|
except PathTraversalError:
|
||||||
|
print(f"✅ '{dangerous}' correctly blocked")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("="*70)
|
||||||
|
print("ALL TESTS PASSED! ✅")
|
||||||
|
print("="*70)
|
||||||
|
print()
|
||||||
|
print("The fix is working correctly:")
|
||||||
|
print(" ✓ Filenames with ellipsis (...) are now accepted")
|
||||||
|
print(" ✓ Path traversal attacks are still blocked")
|
||||||
|
print()
|
||||||
281
tests/test_input_validation.py
Normal file
281
tests/test_input_validation.py
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Tests for input validation module, specifically filename validation.
|
||||||
|
|
||||||
|
Tests the security-critical validate_filename_safe() function to ensure
|
||||||
|
it correctly blocks path traversal attacks while allowing legitimate filenames.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Add src to path (go up one level from tests/ to root)
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||||
|
|
||||||
|
from src.utils.input_validation import (
|
||||||
|
validate_filename_safe,
|
||||||
|
ValidationError,
|
||||||
|
PathTraversalError,
|
||||||
|
InvalidFileTypeError,
|
||||||
|
ALLOWED_AUDIO_EXTENSIONS
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidFilenameSafe:
|
||||||
|
"""Test validate_filename_safe() function with various inputs."""
|
||||||
|
|
||||||
|
def test_simple_valid_filenames(self):
|
||||||
|
"""Test that simple, valid filenames are accepted."""
|
||||||
|
valid_names = [
|
||||||
|
"audio.m4a",
|
||||||
|
"song.wav",
|
||||||
|
"podcast.mp3",
|
||||||
|
"recording.flac",
|
||||||
|
"music.ogg",
|
||||||
|
"voice.aac",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in valid_names:
|
||||||
|
result = validate_filename_safe(filename)
|
||||||
|
assert result == filename, f"Should accept: {filename}"
|
||||||
|
|
||||||
|
def test_filenames_with_ellipsis(self):
|
||||||
|
"""Test filenames with ellipsis (multiple dots) are accepted."""
|
||||||
|
# This is the key test case from the bug report
|
||||||
|
ellipsis_names = [
|
||||||
|
"audio...mp3",
|
||||||
|
"This is... a test.m4a",
|
||||||
|
"Part 1... Part 2.wav",
|
||||||
|
"Wait... what.m4a",
|
||||||
|
"video...multiple...dots.mp3",
|
||||||
|
"This Weird FPV Drone only takes one kind of Battery... Rekon 35 V2.m4a", # Bug report case
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in ellipsis_names:
|
||||||
|
result = validate_filename_safe(filename)
|
||||||
|
assert result == filename, f"Should accept filename with ellipsis: {filename}"
|
||||||
|
|
||||||
|
def test_filenames_with_special_chars(self):
|
||||||
|
"""Test filenames with various special characters."""
|
||||||
|
special_char_names = [
|
||||||
|
"My-Video_2024.m4a",
|
||||||
|
"song (remix).m4a",
|
||||||
|
"audio [final].wav",
|
||||||
|
"test file with spaces.mp3",
|
||||||
|
"file-name_with-symbols.flac",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in special_char_names:
|
||||||
|
result = validate_filename_safe(filename)
|
||||||
|
assert result == filename, f"Should accept: {filename}"
|
||||||
|
|
||||||
|
def test_multiple_extensions(self):
|
||||||
|
"""Test filenames that look like they have multiple extensions."""
|
||||||
|
multi_ext_names = [
|
||||||
|
"backup.tar.gz.mp3", # .mp3 is valid
|
||||||
|
"file.old.wav", # .wav is valid
|
||||||
|
"audio.2024.m4a", # .m4a is valid
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in multi_ext_names:
|
||||||
|
result = validate_filename_safe(filename)
|
||||||
|
assert result == filename, f"Should accept: {filename}"
|
||||||
|
|
||||||
|
def test_path_traversal_attempts(self):
|
||||||
|
"""Test that path traversal attempts are rejected."""
|
||||||
|
dangerous_names = [
|
||||||
|
"../../../etc/passwd",
|
||||||
|
"../../secrets.txt",
|
||||||
|
"../file.mp4",
|
||||||
|
"dir/../file.mp4",
|
||||||
|
"file/../../etc/passwd",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in dangerous_names:
|
||||||
|
with pytest.raises(PathTraversalError) as exc_info:
|
||||||
|
validate_filename_safe(filename)
|
||||||
|
assert "path" in str(exc_info.value).lower(), f"Should reject path traversal: {filename}"
|
||||||
|
|
||||||
|
def test_absolute_paths(self):
|
||||||
|
"""Test that absolute paths are rejected."""
|
||||||
|
absolute_paths = [
|
||||||
|
"/etc/passwd",
|
||||||
|
"/tmp/file.mp4",
|
||||||
|
"/home/user/audio.wav",
|
||||||
|
"C:\\Windows\\System32\\file.mp3", # Windows path
|
||||||
|
"\\\\server\\share\\file.m4a", # UNC path
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in absolute_paths:
|
||||||
|
with pytest.raises(PathTraversalError) as exc_info:
|
||||||
|
validate_filename_safe(filename)
|
||||||
|
assert "path" in str(exc_info.value).lower(), f"Should reject absolute path: {filename}"
|
||||||
|
|
||||||
|
def test_path_separators(self):
|
||||||
|
"""Test that filenames with path separators are rejected."""
|
||||||
|
paths_with_separators = [
|
||||||
|
"dir/file.mp4",
|
||||||
|
"folder\\file.wav",
|
||||||
|
"path/to/audio.m4a",
|
||||||
|
"a/b/c/d.mp3",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in paths_with_separators:
|
||||||
|
with pytest.raises(PathTraversalError) as exc_info:
|
||||||
|
validate_filename_safe(filename)
|
||||||
|
assert "separator" in str(exc_info.value).lower() or "path" in str(exc_info.value).lower(), \
|
||||||
|
f"Should reject path with separators: {filename}"
|
||||||
|
|
||||||
|
def test_null_bytes(self):
|
||||||
|
"""Test that filenames with null bytes are rejected."""
|
||||||
|
null_byte_names = [
|
||||||
|
"file\x00.mp4",
|
||||||
|
"\x00malicious.wav",
|
||||||
|
"audio\x00evil.m4a",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in null_byte_names:
|
||||||
|
with pytest.raises(PathTraversalError) as exc_info:
|
||||||
|
validate_filename_safe(filename)
|
||||||
|
assert "null" in str(exc_info.value).lower(), f"Should reject null bytes: {repr(filename)}"
|
||||||
|
|
||||||
|
def test_empty_filename(self):
|
||||||
|
"""Test that empty filename is rejected."""
|
||||||
|
with pytest.raises(ValidationError) as exc_info:
|
||||||
|
validate_filename_safe("")
|
||||||
|
assert "empty" in str(exc_info.value).lower()
|
||||||
|
|
||||||
|
def test_no_extension(self):
|
||||||
|
"""Test that filenames without extensions are rejected."""
|
||||||
|
no_ext_names = [
|
||||||
|
"filename",
|
||||||
|
"noextension",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in no_ext_names:
|
||||||
|
with pytest.raises(InvalidFileTypeError) as exc_info:
|
||||||
|
validate_filename_safe(filename)
|
||||||
|
assert "extension" in str(exc_info.value).lower(), f"Should reject no extension: {filename}"
|
||||||
|
|
||||||
|
def test_invalid_extensions(self):
|
||||||
|
"""Test that unsupported file extensions are rejected."""
|
||||||
|
invalid_ext_names = [
|
||||||
|
"document.pdf",
|
||||||
|
"image.png",
|
||||||
|
"video.avi",
|
||||||
|
"script.sh",
|
||||||
|
"executable.exe",
|
||||||
|
"text.txt",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in invalid_ext_names:
|
||||||
|
with pytest.raises(InvalidFileTypeError) as exc_info:
|
||||||
|
validate_filename_safe(filename)
|
||||||
|
assert "unsupported" in str(exc_info.value).lower() or "format" in str(exc_info.value).lower(), \
|
||||||
|
f"Should reject invalid extension: {filename}"
|
||||||
|
|
||||||
|
def test_case_insensitive_extensions(self):
|
||||||
|
"""Test that file extensions are case-insensitive."""
|
||||||
|
case_variations = [
|
||||||
|
"audio.MP3",
|
||||||
|
"sound.WAV",
|
||||||
|
"music.M4A",
|
||||||
|
"podcast.FLAC",
|
||||||
|
"voice.AAC",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in case_variations:
|
||||||
|
# Should not raise exception
|
||||||
|
result = validate_filename_safe(filename)
|
||||||
|
assert result == filename, f"Should accept case variation: {filename}"
|
||||||
|
|
||||||
|
def test_edge_cases(self):
|
||||||
|
"""Test various edge cases."""
|
||||||
|
# Just dots (but with valid extension) - should pass
|
||||||
|
assert validate_filename_safe("...mp3") == "...mp3"
|
||||||
|
assert validate_filename_safe("....wav") == "....wav"
|
||||||
|
|
||||||
|
# Filenames starting with dot (hidden files on Unix)
|
||||||
|
assert validate_filename_safe(".hidden.m4a") == ".hidden.m4a"
|
||||||
|
|
||||||
|
# Very long filename (but valid)
|
||||||
|
long_name = "a" * 200 + ".mp3"
|
||||||
|
assert validate_filename_safe(long_name) == long_name
|
||||||
|
|
||||||
|
def test_allowed_extensions_comprehensive(self):
|
||||||
|
"""Test all allowed extensions from ALLOWED_AUDIO_EXTENSIONS."""
|
||||||
|
for ext in ALLOWED_AUDIO_EXTENSIONS:
|
||||||
|
filename = f"test{ext}"
|
||||||
|
result = validate_filename_safe(filename)
|
||||||
|
assert result == filename, f"Should accept allowed extension: {ext}"
|
||||||
|
|
||||||
|
|
||||||
|
class TestBugReportCase:
|
||||||
|
"""Specific test for the bug report case."""
|
||||||
|
|
||||||
|
def test_bug_report_filename(self):
|
||||||
|
"""
|
||||||
|
Test the exact filename from the bug report that was failing.
|
||||||
|
|
||||||
|
Bug: "This Weird FPV Drone only takes one kind of Battery... Rekon 35 V2.m4a"
|
||||||
|
was being rejected due to "..." being parsed as ".."
|
||||||
|
"""
|
||||||
|
filename = "This Weird FPV Drone only takes one kind of Battery... Rekon 35 V2.m4a"
|
||||||
|
|
||||||
|
# Should NOT raise any exception
|
||||||
|
result = validate_filename_safe(filename)
|
||||||
|
assert result == filename
|
||||||
|
|
||||||
|
def test_various_ellipsis_patterns(self):
|
||||||
|
"""Test various ellipsis patterns that should all be accepted."""
|
||||||
|
patterns = [
|
||||||
|
"...", # Three dots
|
||||||
|
"....", # Four dots
|
||||||
|
".....", # Five dots
|
||||||
|
"file...end.mp3",
|
||||||
|
"start...middle...end.wav",
|
||||||
|
]
|
||||||
|
|
||||||
|
for pattern in patterns:
|
||||||
|
if not pattern.endswith(tuple(f"{ext}" for ext in ALLOWED_AUDIO_EXTENSIONS)):
|
||||||
|
pattern += ".mp3" # Add valid extension
|
||||||
|
result = validate_filename_safe(pattern)
|
||||||
|
assert result == pattern
|
||||||
|
|
||||||
|
|
||||||
|
class TestSecurityBoundary:
|
||||||
|
"""Test the security boundary between safe and dangerous filenames."""
|
||||||
|
|
||||||
|
def test_just_two_dots_vs_path_separator(self):
|
||||||
|
"""
|
||||||
|
Test the critical distinction:
|
||||||
|
- "file..mp3" (two dots in filename) = SAFE
|
||||||
|
- "../file.mp3" (two dots as path component) = DANGEROUS
|
||||||
|
"""
|
||||||
|
# Safe: dots within filename
|
||||||
|
safe_filenames = [
|
||||||
|
"file..mp3",
|
||||||
|
"..file.mp3",
|
||||||
|
"file...mp3",
|
||||||
|
"f..i..l..e.mp3",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in safe_filenames:
|
||||||
|
result = validate_filename_safe(filename)
|
||||||
|
assert result == filename, f"Should be safe: {filename}"
|
||||||
|
|
||||||
|
# Dangerous: dots as directory reference
|
||||||
|
dangerous_filenames = [
|
||||||
|
"../file.mp3",
|
||||||
|
"../../file.mp3",
|
||||||
|
"dir/../file.mp3",
|
||||||
|
]
|
||||||
|
|
||||||
|
for filename in dangerous_filenames:
|
||||||
|
with pytest.raises(PathTraversalError):
|
||||||
|
validate_filename_safe(filename)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v"])
|
||||||
Reference in New Issue
Block a user