From 046204d5557f28e302708cbcb5f61f4fe5f23b50 Mon Sep 17 00:00:00 2001
From: Alihan
Date: Sun, 15 Jun 2025 17:50:05 +0300
Subject: [PATCH] transcription flow polishing, bugfixes

---
 .gitignore       |  4 +++-
 data             |  1 +
 mcp.logs         |  6 ++++++
 model_manager.py | 36 ++++++++++++++++++++++++++++++------
 run_server.sh    | 15 ++++++++-------
 transcriber.py   | 11 ++++++-----
 6 files changed, 54 insertions(+), 19 deletions(-)
 create mode 120000 data
 create mode 100644 mcp.logs

diff --git a/.gitignore b/.gitignore
index 4e736a6..f6a132e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,6 @@ venv/
 
 # Cython
 *.pyd
-
+logs/**
+User/**
+data/**
diff --git a/data b/data
new file mode 120000
index 0000000..409ac74
--- /dev/null
+++ b/data
@@ -0,0 +1 @@
+/media/raid/agents/tools/mcp-transcriptor
\ No newline at end of file
diff --git a/mcp.logs b/mcp.logs
new file mode 100644
index 0000000..88f960a
--- /dev/null
+++ b/mcp.logs
@@ -0,0 +1,6 @@
+{"jsonrpc":"2.0","id":1,"result":{"protocolVersion":"2025-03-26","capabilities":{"experimental":{},"prompts":{"listChanged":false},"resources":{"subscribe":false,"listChanged":false},"tools":{"listChanged":false}},"serverInfo":{"name":"fast-whisper-mcp-server","version":"1.9.4"}}}
+INFO:mcp.server.lowlevel.server:Processing request of type ListToolsRequest
+{"jsonrpc":"2.0","id":2,"result":{"tools":[{"name":"get_model_info_api","description":"\n Get available Whisper model information\n ","inputSchema":{"properties":{},"title":"get_model_info_apiArguments","type":"object"}},{"name":"transcribe","description":"\n Transcribe audio files using Faster Whisper\n\n Args:\n audio_path: Path to the audio file\n model_name: Model name (tiny, base, small, medium, large-v1, large-v2, large-v3)\n device: Execution device (cpu, cuda, auto)\n compute_type: Computation type (float16, int8, auto)\n language: Language code (such as zh, en, ja, etc., auto-detect by default)\n output_format: Output format (vtt, srt, json or txt)\n beam_size: Beam search size, larger values may improve accuracy but reduce speed\n temperature: Sampling temperature, greedy decoding\n initial_prompt: Initial prompt text, can help the model better understand context\n output_directory: Output directory path, defaults to the audio file's directory\n\n Returns:\n str: Transcription result, in VTT subtitle or JSON format\n ","inputSchema":{"properties":{"audio_path":{"title":"Audio Path","type":"string"},"model_name":{"default":"large-v3","title":"Model Name","type":"string"},"device":{"default":"auto","title":"Device","type":"string"},"compute_type":{"default":"auto","title":"Compute Type","type":"string"},"language":{"default":null,"title":"Language","type":"string"},"output_format":{"default":"vtt","title":"Output Format","type":"string"},"beam_size":{"default":5,"title":"Beam Size","type":"integer"},"temperature":{"default":0.0,"title":"Temperature","type":"number"},"initial_prompt":{"default":null,"title":"Initial Prompt","type":"string"},"output_directory":{"default":null,"title":"Output Directory","type":"string"}},"required":["audio_path"],"title":"transcribeArguments","type":"object"}},{"name":"batch_transcribe_audio","description":"\n Batch transcribe audio files in a folder\n\n Args:\n audio_folder: Path to the folder containing audio files\n output_folder: Output folder path, defaults to a 'transcript' subfolder in audio_folder\n model_name: Model name (tiny, base, small, medium, large-v1, large-v2, large-v3)\n device: Execution device (cpu, cuda, auto)\n compute_type: Computation type (float16, int8, auto)\n language: Language code (such as zh, en, ja, etc., auto-detect by default)\n output_format: Output format (vtt, srt, json or txt)\n beam_size: Beam search size, larger values may improve accuracy but reduce speed\n temperature: Sampling temperature, 0 means greedy decoding\n initial_prompt: Initial prompt text, can help the model better understand context\n parallel_files: Number of files to process in parallel (only effective in CPU mode)\n\n Returns:\n str: Batch processing summary, including processing time and success rate\n ","inputSchema":{"properties":{"audio_folder":{"title":"Audio Folder","type":"string"},"output_folder":{"default":null,"title":"Output Folder","type":"string"},"model_name":{"default":"large-v3","title":"Model Name","type":"string"},"device":{"default":"auto","title":"Device","type":"string"},"compute_type":{"default":"auto","title":"Compute Type","type":"string"},"language":{"default":null,"title":"Language","type":"string"},"output_format":{"default":"vtt","title":"Output Format","type":"string"},"beam_size":{"default":5,"title":"Beam Size","type":"integer"},"temperature":{"default":0.0,"title":"Temperature","type":"number"},"initial_prompt":{"default":null,"title":"Initial Prompt","type":"string"},"parallel_files":{"default":1,"title":"Parallel Files","type":"integer"}},"required":["audio_folder"],"title":"batch_transcribe_audioArguments","type":"object"}}]}}
+INFO:mcp.server.lowlevel.server:Processing request of type CallToolRequest
+INFO:model_manager:GPU test passed: NVIDIA GeForce RTX 3060 (12.5GB)
+INFO:model_manager:Loading Whisper model: large-v3 device: cuda compute type: float16
diff --git a/model_manager.py b/model_manager.py
index af6695f..5c781b8 100644
--- a/model_manager.py
+++ b/model_manager.py
@@ -4,7 +4,7 @@ Model Management Module
 Responsible for loading, caching, and managing Whisper models
 """
 
-import os
+import os; print(os.environ.get("WHISPER_MODEL_DIR"))
 import time
 import logging
 from typing import Dict, Any
@@ -17,6 +17,30 @@ logger = logging.getLogger(__name__)
 # Global model instance cache
 model_instances = {}
 
+def test_gpu_driver():
+    """Simple GPU driver test"""
+    try:
+        if not torch.cuda.is_available():
+            logger.error("CUDA not available in PyTorch")
+            raise RuntimeError("CUDA not available")
+
+        gpu_count = torch.cuda.device_count()
+        if gpu_count == 0:
+            logger.error("No CUDA devices found")
+            raise RuntimeError("No CUDA devices")
+
+        # Quick GPU test
+        test_tensor = torch.randn(10, 10).cuda()
+        _ = test_tensor @ test_tensor.T
+
+        device_name = torch.cuda.get_device_name(0)
+        memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+        logger.info(f"GPU test passed: {device_name} ({memory_gb:.1f}GB)")
+
+    except Exception as e:
+        logger.error(f"GPU test failed: {e}")
+        raise RuntimeError(f"GPU initialization failed: {e}")
+
 def get_whisper_model(model_name: str, device: str, compute_type: str) -> Dict[str, Any]:
     """
     Get or create Whisper model instance
@@ -46,9 +70,8 @@ def get_whisper_model(model_name: str, device: str, compute_type: str) -> Dict[s
         raise ValueError(f"Invalid device: {device}. Valid devices: cpu, cuda")
 
     if device == "cuda" and not torch.cuda.is_available():
-        logger.warning("CUDA not available, automatically switching to CPU")
-        device = "cpu"
-        compute_type = "int8"
+        logger.error("CUDA requested but not available")
+        raise RuntimeError("CUDA not available but explicitly requested")
 
     if compute_type not in ["float16", "int8"]:
         raise ValueError(f"Invalid compute type: {compute_type}. Valid compute types: float16, int8")
@@ -65,8 +88,9 @@ def get_whisper_model(model_name: str, device: str, compute_type: str) -> Dict[s
         logger.info(f"Using cached model instance: {model_key}")
         return model_instances[model_key]
 
-    # Clean GPU memory (if using CUDA)
+    # Test GPU driver before loading the model, then clean GPU memory (if using CUDA)
     if device == "cuda":
+        test_gpu_driver()
         torch.cuda.empty_cache()
 
     # Instantiate model
@@ -173,4 +197,4 @@ def get_model_info() -> str:
             "memory_available": f"{torch.cuda.get_device_properties(0).total_memory / 1e9 - torch.cuda.memory_allocated() / 1e9:.2f} GB"
         }
 
-    return json.dumps(info, indent=2)
\ No newline at end of file
+    return json.dumps(info, indent=2)
diff --git a/run_server.sh b/run_server.sh
index 0112af3..04a0bb7 100755
--- a/run_server.sh
+++ b/run_server.sh
@@ -10,12 +10,13 @@ USER_ID=$(id -u)
 GROUP_ID=$(id -g)
 
 # Set environment variables
+export CUDA_VISIBLE_DEVICES=1
 export WHISPER_MODEL_DIR="/home/uad/agents/tools/mcp-transcriptor/data/models"
-export TRANSCRIPTION_OUTPUT_DIR="/home/uad/agents/tools/mcp-transcriptor/data/transcripts"
-export TRANSCRIPTION_BATCH_OUTPUT_DIR="/home/uad/agents/tools/mcp-transcriptor/data/transcripts/batch"
-export TRANSCRIPTION_MODEL="base"
-export TRANSCRIPTION_DEVICE="auto"
-export TRANSCRIPTION_COMPUTE_TYPE="auto"
+export TRANSCRIPTION_OUTPUT_DIR="/media/raid/agents/tools/mcp-transcriptor/outputs"
+export TRANSCRIPTION_BATCH_OUTPUT_DIR="/media/raid/agents/tools/mcp-transcriptor/outputs/batch"
+export TRANSCRIPTION_MODEL="large-v3"
+export TRANSCRIPTION_DEVICE="cuda"
+export TRANSCRIPTION_COMPUTE_TYPE="float16"
 export TRANSCRIPTION_OUTPUT_FORMAT="txt"
 export TRANSCRIPTION_BEAM_SIZE="2"
 export TRANSCRIPTION_TEMPERATURE="0.0"
@@ -24,6 +25,7 @@ export TRANSCRIPTION_FILENAME_PREFIX="test_"
 
 # Log start of the script
 echo "$(datetime_prefix) Starting whisper server script..."
+echo "WHISPER_MODEL_DIR: $WHISPER_MODEL_DIR"
 
 # Optional: Verify required directories exist
 if [ ! -d "$WHISPER_MODEL_DIR" ]; then
@@ -32,5 +34,4 @@ if [ ! -d "$WHISPER_MODEL_DIR" ]; then
 fi
 
 # Run the Python script with the defined environment variables
-sudo /home/uad/agents/tools/mcp-transcriptor/venv/bin/python \
-    /home/uad/agents/tools/mcp-transcriptor/whisper_server.py
+/home/uad/agents/tools/mcp-transcriptor/venv/bin/python /home/uad/agents/tools/mcp-transcriptor/whisper_server.py 2>&1 | tee /home/uad/agents/tools/mcp-transcriptor/mcp.logs
diff --git a/transcriber.py b/transcriber.py
index ea65e90..fa70169 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -19,9 +19,9 @@ logger = logging.getLogger(__name__)
 # Environment variable defaults
 DEFAULT_OUTPUT_DIR = os.getenv('TRANSCRIPTION_OUTPUT_DIR', None)
 DEFAULT_BATCH_OUTPUT_DIR = os.getenv('TRANSCRIPTION_BATCH_OUTPUT_DIR', None)
-DEFAULT_MODEL = os.getenv('TRANSCRIPTION_MODEL', 'large-v3')
-DEFAULT_DEVICE = os.getenv('TRANSCRIPTION_DEVICE', 'auto')
-DEFAULT_COMPUTE_TYPE = os.getenv('TRANSCRIPTION_COMPUTE_TYPE', 'auto')
+DEFAULT_MODEL = os.getenv('TRANSCRIPTION_MODEL', 'base')
+DEFAULT_DEVICE = os.getenv('TRANSCRIPTION_DEVICE', 'cuda')
+DEFAULT_COMPUTE_TYPE = os.getenv('TRANSCRIPTION_COMPUTE_TYPE', 'float16')
 DEFAULT_LANGUAGE = os.getenv('TRANSCRIPTION_LANGUAGE', None)
 DEFAULT_OUTPUT_FORMAT = os.getenv('TRANSCRIPTION_OUTPUT_FORMAT', 'txt')
 DEFAULT_BEAM_SIZE = int(os.getenv('TRANSCRIPTION_BEAM_SIZE', '5'))
@@ -31,7 +31,7 @@ DEFAULT_TEMPERATURE = float(os.getenv('TRANSCRIPTION_TEMPERATURE', '0.0'))
 WHISPER_MODEL_DIR = os.getenv('WHISPER_MODEL_DIR', None)
 
 # File naming configuration
-USE_TIMESTAMP = os.getenv('TRANSCRIPTION_USE_TIMESTAMP', 'true').lower() == 'true'
+USE_TIMESTAMP = os.getenv('TRANSCRIPTION_USE_TIMESTAMP', 'false').lower() == 'true'
 FILENAME_PREFIX = os.getenv('TRANSCRIPTION_FILENAME_PREFIX', '')
 FILENAME_SUFFIX = os.getenv('TRANSCRIPTION_FILENAME_SUFFIX', '')
 
@@ -97,6 +97,7 @@ def transcribe_audio(
 
     # Set transcription parameters
    options = {
+        "verbose": True,
         "language": language,
         "vad_filter": True,
         "vad_parameters": {"min_silence_duration_ms": 500},
@@ -106,7 +107,7 @@
         "word_timestamps": True,
         "suppress_tokens": [-1],
         "condition_on_previous_text": True,
-        "compression_ratio_threshold": 2.4
+        "compression_ratio_threshold": 2.4,
     }
 
     start_time = time.time()