transcription flow polishing, bugfixes
.gitignore (vendored, 4 changes)
@@ -14,4 +14,6 @@ venv/
 # Cython
 *.pyd
 
+logs/**
+User/**
+data/**
mcp.logs (new file, 6 lines)
@@ -0,0 +1,6 @@
{"jsonrpc":"2.0","id":1,"result":{"protocolVersion":"2025-03-26","capabilities":{"experimental":{},"prompts":{"listChanged":false},"resources":{"subscribe":false,"listChanged":false},"tools":{"listChanged":false}},"serverInfo":{"name":"fast-whisper-mcp-server","version":"1.9.4"}}}
INFO:mcp.server.lowlevel.server:Processing request of type ListToolsRequest
{"jsonrpc":"2.0","id":2,"result":{"tools":[{"name":"get_model_info_api","description":"\n Get available Whisper model information\n ","inputSchema":{"properties":{},"title":"get_model_info_apiArguments","type":"object"}},{"name":"transcribe","description":"\n Transcribe audio files using Faster Whisper\n\n Args:\n audio_path: Path to the audio file\n model_name: Model name (tiny, base, small, medium, large-v1, large-v2, large-v3)\n device: Execution device (cpu, cuda, auto)\n compute_type: Computation type (float16, int8, auto)\n language: Language code (such as zh, en, ja, etc., auto-detect by default)\n output_format: Output format (vtt, srt, json or txt)\n beam_size: Beam search size, larger values may improve accuracy but reduce speed\n temperature: Sampling temperature, greedy decoding\n initial_prompt: Initial prompt text, can help the model better understand context\n output_directory: Output directory path, defaults to the audio file's directory\n\n Returns:\n str: Transcription result, in VTT subtitle or JSON format\n ","inputSchema":{"properties":{"audio_path":{"title":"Audio Path","type":"string"},"model_name":{"default":"large-v3","title":"Model Name","type":"string"},"device":{"default":"auto","title":"Device","type":"string"},"compute_type":{"default":"auto","title":"Compute Type","type":"string"},"language":{"default":null,"title":"Language","type":"string"},"output_format":{"default":"vtt","title":"Output Format","type":"string"},"beam_size":{"default":5,"title":"Beam Size","type":"integer"},"temperature":{"default":0.0,"title":"Temperature","type":"number"},"initial_prompt":{"default":null,"title":"Initial Prompt","type":"string"},"output_directory":{"default":null,"title":"Output Directory","type":"string"}},"required":["audio_path"],"title":"transcribeArguments","type":"object"}},{"name":"batch_transcribe_audio","description":"\n Batch transcribe audio files in a folder\n\n Args:\n audio_folder: Path to the folder containing audio files\n output_folder: Output folder path, defaults to a 'transcript' subfolder in audio_folder\n model_name: Model name (tiny, base, small, medium, large-v1, large-v2, large-v3)\n device: Execution device (cpu, cuda, auto)\n compute_type: Computation type (float16, int8, auto)\n language: Language code (such as zh, en, ja, etc., auto-detect by default)\n output_format: Output format (vtt, srt, json or txt)\n beam_size: Beam search size, larger values may improve accuracy but reduce speed\n temperature: Sampling temperature, 0 means greedy decoding\n initial_prompt: Initial prompt text, can help the model better understand context\n parallel_files: Number of files to process in parallel (only effective in CPU mode)\n\n Returns:\n str: Batch processing summary, including processing time and success rate\n ","inputSchema":{"properties":{"audio_folder":{"title":"Audio Folder","type":"string"},"output_folder":{"default":null,"title":"Output Folder","type":"string"},"model_name":{"default":"large-v3","title":"Model Name","type":"string"},"device":{"default":"auto","title":"Device","type":"string"},"compute_type":{"default":"auto","title":"Compute Type","type":"string"},"language":{"default":null,"title":"Language","type":"string"},"output_format":{"default":"vtt","title":"Output Format","type":"string"},"beam_size":{"default":5,"title":"Beam Size","type":"integer"},"temperature":{"default":0.0,"title":"Temperature","type":"number"},"initial_prompt":{"default":null,"title":"Initial 
Prompt","type":"string"},"parallel_files":{"default":1,"title":"Parallel Files","type":"integer"}},"required":["audio_folder"],"title":"batch_transcribe_audioArguments","type":"object"}}]}}
INFO:mcp.server.lowlevel.server:Processing request of type CallToolRequest
INFO:model_manager:GPU test passed: NVIDIA GeForce RTX 3060 (12.5GB)
INFO:model_manager:Loading Whisper model: large-v3 device: cuda compute type: float16
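The CallToolRequest logged above is just a JSON-RPC message sent to the server over stdio. A minimal client-side sketch of such a request for the transcribe tool, using the argument names from the tools/list reply logged above; the audio path, id, and values are placeholders, not taken from the repo:

# Hypothetical MCP client request; writing one line of this JSON to the server's
# stdin is what the server logs as a CallToolRequest.
import json

call_request = {
    "jsonrpc": "2.0",
    "id": 3,
    "method": "tools/call",
    "params": {
        "name": "transcribe",
        "arguments": {
            "audio_path": "/path/to/audio.wav",  # placeholder path
            "model_name": "large-v3",
            "device": "cuda",
            "compute_type": "float16",
            "output_format": "txt",
        },
    },
}
print(json.dumps(call_request))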
model_manager.py
@@ -4,7 +4,7 @@ Model Management Module
 Responsible for loading, caching, and managing Whisper models
 """
 
-import os
+import os; print(os.environ.get("WHISPER_MODEL_DIR"))
 import time
 import logging
 from typing import Dict, Any
@@ -17,6 +17,30 @@ logger = logging.getLogger(__name__)
 # Global model instance cache
 model_instances = {}
 
+def test_gpu_driver():
+    """Simple GPU driver test"""
+    try:
+        if not torch.cuda.is_available():
+            logger.error("CUDA not available in PyTorch")
+            raise RuntimeError("CUDA not available")
+
+        gpu_count = torch.cuda.device_count()
+        if gpu_count == 0:
+            logger.error("No CUDA devices found")
+            raise RuntimeError("No CUDA devices")
+
+        # Quick GPU test
+        test_tensor = torch.randn(10, 10).cuda()
+        _ = test_tensor @ test_tensor.T
+
+        device_name = torch.cuda.get_device_name(0)
+        memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+        logger.info(f"GPU test passed: {device_name} ({memory_gb:.1f}GB)")
+
+    except Exception as e:
+        logger.error(f"GPU test failed: {e}")
+        raise RuntimeError(f"GPU initialization failed: {e}")
+
 def get_whisper_model(model_name: str, device: str, compute_type: str) -> Dict[str, Any]:
     """
     Get or create Whisper model instance
@@ -46,9 +70,8 @@ def get_whisper_model(model_name: str, device: str, compute_type: str) -> Dict[s
         raise ValueError(f"Invalid device: {device}. Valid devices: cpu, cuda")
 
     if device == "cuda" and not torch.cuda.is_available():
-        logger.warning("CUDA not available, automatically switching to CPU")
-        device = "cpu"
-        compute_type = "int8"
+        logger.error("CUDA requested but not available")
+        raise RuntimeError("CUDA not available but explicitly requested")
 
     if compute_type not in ["float16", "int8"]:
        raise ValueError(f"Invalid compute type: {compute_type}. Valid compute types: float16, int8")
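Behavior change: requesting CUDA on a machine where torch cannot see a GPU now fails fast instead of silently dropping to CPU/int8. A minimal caller-side sketch, not part of this commit, of what an explicit fallback would look like under the new contract; the module name is taken from the "INFO:model_manager:" log lines above:

# Hypothetical caller: fall back to CPU explicitly instead of relying on the old silent switch.
from model_manager import get_whisper_model

try:
    instance = get_whisper_model("large-v3", device="cuda", compute_type="float16")
except RuntimeError as err:
    print(f"CUDA unavailable ({err}); retrying on CPU with int8")
    instance = get_whisper_model("large-v3", device="cpu", compute_type="int8")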
@@ -65,8 +88,9 @@ def get_whisper_model(model_name: str, device: str, compute_type: str) -> Dict[s
         logger.info(f"Using cached model instance: {model_key}")
         return model_instances[model_key]
 
-    # Clean GPU memory (if using CUDA)
+    # Test GPU driver before loading model and clean
     if device == "cuda":
+        test_gpu_driver()
         torch.cuda.empty_cache()
 
     # Instantiate model
@@ -173,4 +197,4 @@ def get_model_info() -> str:
             "memory_available": f"{torch.cuda.get_device_properties(0).total_memory / 1e9 - torch.cuda.memory_allocated() / 1e9:.2f} GB"
         }
 
     return json.dumps(info, indent=2)
whisper server start script
@@ -10,12 +10,13 @@ USER_ID=$(id -u)
 GROUP_ID=$(id -g)
 
 # Set environment variables
+export CUDA_VISIBLE_DEVICES=1
 export WHISPER_MODEL_DIR="/home/uad/agents/tools/mcp-transcriptor/data/models"
-export TRANSCRIPTION_OUTPUT_DIR="/home/uad/agents/tools/mcp-transcriptor/data/transcripts"
-export TRANSCRIPTION_BATCH_OUTPUT_DIR="/home/uad/agents/tools/mcp-transcriptor/data/transcripts/batch"
-export TRANSCRIPTION_MODEL="base"
-export TRANSCRIPTION_DEVICE="auto"
-export TRANSCRIPTION_COMPUTE_TYPE="auto"
+export TRANSCRIPTION_OUTPUT_DIR="/media/raid/agents/tools/mcp-transcriptor/outputs"
+export TRANSCRIPTION_BATCH_OUTPUT_DIR="/media/raid/agents/tools/mcp-transcriptor/outputs/batch"
+export TRANSCRIPTION_MODEL="large-v3"
+export TRANSCRIPTION_DEVICE="cuda"
+export TRANSCRIPTION_COMPUTE_TYPE="cuda"
 export TRANSCRIPTION_OUTPUT_FORMAT="txt"
 export TRANSCRIPTION_BEAM_SIZE="2"
 export TRANSCRIPTION_TEMPERATURE="0.0"
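With CUDA_VISIBLE_DEVICES=1 exported above, only the second physical GPU is exposed to the process, and it appears as device 0 inside it. A quick sketch of how to confirm that from Python; illustrative only, not part of the repo:

# The variable must be set before CUDA is initialized; the run script exports it
# before launching whisper_server.py, which has the same effect.
import os
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

import torch

if torch.cuda.is_available():
    print(torch.cuda.device_count())      # expected: 1
    print(torch.cuda.get_device_name(0))  # physical GPU 1, addressed as cuda:0
else:
    print("no visible CUDA device")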
@@ -24,6 +25,7 @@ export TRANSCRIPTION_FILENAME_PREFIX="test_"
 
 # Log start of the script
 echo "$(datetime_prefix) Starting whisper server script..."
+echo "test: $WHISPER_MODEL_DIR"
 
 # Optional: Verify required directories exist
 if [ ! -d "$WHISPER_MODEL_DIR" ]; then
@@ -32,5 +34,4 @@ if [ ! -d "$WHISPER_MODEL_DIR" ]; then
 fi
 
 # Run the Python script with the defined environment variables
-sudo /home/uad/agents/tools/mcp-transcriptor/venv/bin/python \
-    /home/uad/agents/tools/mcp-transcriptor/whisper_server.py
+/home/uad/agents/tools/mcp-transcriptor/venv/bin/python /home/uad/agents/tools/mcp-transcriptor/whisper_server.py 2>&1 | tee /home/uad/agents/tools/mcp-transcriptor/mcp.logs
transcription module
@@ -19,9 +19,9 @@ logger = logging.getLogger(__name__)
 # Environment variable defaults
 DEFAULT_OUTPUT_DIR = os.getenv('TRANSCRIPTION_OUTPUT_DIR', None)
 DEFAULT_BATCH_OUTPUT_DIR = os.getenv('TRANSCRIPTION_BATCH_OUTPUT_DIR', None)
-DEFAULT_MODEL = os.getenv('TRANSCRIPTION_MODEL', 'large-v3')
-DEFAULT_DEVICE = os.getenv('TRANSCRIPTION_DEVICE', 'auto')
-DEFAULT_COMPUTE_TYPE = os.getenv('TRANSCRIPTION_COMPUTE_TYPE', 'auto')
+DEFAULT_MODEL = os.getenv('TRANSCRIPTION_MODEL', 'base')
+DEFAULT_DEVICE = os.getenv('TRANSCRIPTION_DEVICE', 'cuda')
+DEFAULT_COMPUTE_TYPE = os.getenv('TRANSCRIPTION_COMPUTE_TYPE', 'base')
 DEFAULT_LANGUAGE = os.getenv('TRANSCRIPTION_LANGUAGE', None)
 DEFAULT_OUTPUT_FORMAT = os.getenv('TRANSCRIPTION_OUTPUT_FORMAT', 'txt')
 DEFAULT_BEAM_SIZE = int(os.getenv('TRANSCRIPTION_BEAM_SIZE', '5'))
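These module-level values are only fallbacks; when the run script above exports the corresponding variables, the exported values win. A tiny illustration, with example values only:

import os

# Simulate the run script's export; in practice the server inherits it from the shell.
os.environ["TRANSCRIPTION_MODEL"] = "large-v3"

print(os.getenv("TRANSCRIPTION_MODEL", "base"))  # -> "large-v3": the export wins over the fallback
print(os.getenv("TRANSCRIPTION_LANGUAGE"))       # -> None when nothing is exported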
@@ -31,7 +31,7 @@ DEFAULT_TEMPERATURE = float(os.getenv('TRANSCRIPTION_TEMPERATURE', '0.0'))
 WHISPER_MODEL_DIR = os.getenv('WHISPER_MODEL_DIR', None)
 
 # File naming configuration
-USE_TIMESTAMP = os.getenv('TRANSCRIPTION_USE_TIMESTAMP', 'true').lower() == 'true'
+USE_TIMESTAMP = os.getenv('TRANSCRIPTION_USE_TIMESTAMP', 'false').lower() == 'true'
 FILENAME_PREFIX = os.getenv('TRANSCRIPTION_FILENAME_PREFIX', '')
 FILENAME_SUFFIX = os.getenv('TRANSCRIPTION_FILENAME_SUFFIX', '')
 
@@ -97,6 +97,7 @@ def transcribe_audio(
 
     # Set transcription parameters
     options = {
+        "verbose": True,
         "language": language,
         "vad_filter": True,
         "vad_parameters": {"min_silence_duration_ms": 500},
@@ -106,7 +107,7 @@ def transcribe_audio(
         "word_timestamps": True,
         "suppress_tokens": [-1],
         "condition_on_previous_text": True,
-        "compression_ratio_threshold": 2.4
+        "compression_ratio_threshold": 2.4,
     }
 
     start_time = time.time()
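The hunk above adds "verbose": True and a trailing comma to the options mapping. If that mapping is later unpacked into a transcribe call whose signature does not accept every key, a small guard keeps such additions from raising TypeError. A hypothetical helper sketch, not part of the repo:

# Hypothetical helper: keep only the keyword arguments the target callable accepts.
import inspect
from typing import Any, Callable, Dict

def filter_options(func: Callable[..., Any], options: Dict[str, Any]) -> Dict[str, Any]:
    accepted = set(inspect.signature(func).parameters)
    return {key: value for key, value in options.items() if key in accepted}

# usage sketch: segments, info = model.transcribe(audio_path, **filter_options(model.transcribe, options))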